package org.kepler.scia;

import java.io.IOException;
import java.io.StringReader;
import java.util.Hashtable;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;


/**
 * Compute the similarity of schema nodes in two schema trees based on description.
 * DescriptionMatcher takes care of refining descriptions, indexing the source
 * schema tree, computing the similarity of each target node with each
 * source node and storing it in a hashtable, descSimTable.
 *
 * @author Guilian Wang based on the code by uchukmol
 */
public class DescriptionMatcher {
    /**
     * Creates a matcher and immediately computes the description similarity
     * of the nodes of the two given schema trees by delegating to
     * {@code computeTreesDescSim}; the results are written into
     * {@code descSimTable}.
     *
     * @param target the target schema tree
     * @param source the source schema tree
     * @param descSimTable the table that receives the computed similarities
     * @throws IOException propagated from {@code computeTreesDescSim}
     */
    public DescriptionMatcher(SchemaTree target, SchemaTree source,
        Hashtable descSimTable) throws IOException {
        computeTreesDescSim(target, source, descSimTable);

        // Nothing more to do unless debugging output was requested.
        if (!SCIA.debug_on) {
            return;
        }

        System.err.println("descSimTable = " + descSimTable);
    }

    /**
     * Normalises a node description by running it through the analyzer twice:
     * the tokens obtained from {@code desc} are joined into one string, and
     * that intermediate string is analysed and joined once more.
     *
     * <p>Each token is preceded by a single blank, so a non-empty result
     * always starts with a blank. An empty string is returned when the
     * analyzer produces no token at all; callers use that to avoid indexing
     * empty text.
     *
     * @param desc the raw description of a node; must not be {@code null}
     * @param analyzer the analyzer used to tokenise the description
     * @param stemming_field the field name handed to the analyzer when the
     *        token streams are created
     * @return the refined description, or {@code ""} if no token was produced
     * @throws IOException if reading from a token stream fails
     * @throws NullPointerException if {@code desc} is {@code null}
     *
     * @author uchukmol
     */
    public String descriptionRefine(String desc, Analyzer analyzer,
        String stemming_field) throws IOException, NullPointerException {
        String firstPass = joinTokens(analyzer.tokenStream(stemming_field,
                    new StringReader(desc)));

        // No token at all: skip the second pass and report "nothing to index".
        if (firstPass.length() == 0) {
            return "";
        }

        return joinTokens(analyzer.tokenStream(stemming_field,
                new StringReader(firstPass)));
    }

    /**
     * Drains the given token stream, appending a blank followed by the token
     * text for every token, and closes the stream afterwards.
     */
    private static String joinTokens(TokenStream stream)
        throws IOException {
        // StringBuffer avoids re-copying the whole text for every token.
        StringBuffer joined = new StringBuffer();

        try {
            for (Token token = stream.next(); token != null;
                    token = stream.next()) {
                joined.append(' ').append(token.termText());
            }
        } finally {
            stream.close();
        }

        return joined.toString();
    }

    /**
     * Indexes the descriptions of all nodes of a schema tree so that they can
     * be searched with queries afterwards,
     * e.g. {@code Searcher searcher = new IndexSearcher(index_dir)}.
     *
     * <p>The index is written to {@code SCIA.SCIA_WORK_DIR + "/TreeIndex"}
     * with a {@code StandardAnalyzer}; both the directory and the writer are
     * opened with their create flag set to {@code true}. A tree whose
     * {@code rootNode} is {@code null} produces an index without documents.
     *
     * @param tree the schema tree whose node descriptions are indexed
     * @throws IOException if the index cannot be created or written
     */
    public void indexSchemaTreeDesc(SchemaTree tree) throws IOException {
        Directory tree_index_dir = FSDirectory.getDirectory(SCIA.SCIA_WORK_DIR +
                "/TreeIndex", true);
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(tree_index_dir, analyzer, true);

        try {
            if (tree.rootNode != null) {
                indexDescForAll(tree.rootNode, writer, analyzer);
            }
        } finally {
            // Release the writer even when indexing a node fails.
            writer.close();
        }
    }

    /**
     * Indexes the description of a single node.
     *
     * <p>The description is refined with {@code descriptionRefine} first. If
     * the node has no description, or nothing is left after refinement, no
     * document is added. Otherwise one document is written with the refined
     * text in {@code content_field} and the node path in {@code path_field}.
     *
     * @param node the node whose description is indexed
     * @param writer the open index writer that receives the document
     * @param analyzer the analyzer used to refine the description
     * @throws IOException if refining the description or writing fails
     */
    public void indexDesc(TNode node, IndexWriter writer, Analyzer analyzer)
        throws IOException {
        String str = node.getDescription();

        if (str == null) {
            return;
        }

        String description = descriptionRefine(str, analyzer, "content_field");

        // The former test (!equals("") || !equals(null)) was always true and
        // therefore let empty descriptions through to the index.
        if ((description == null) || (description.length() == 0)) {
            return;
        }

        Document doc = new Document();
        doc.add(Field.Text("content_field", description));
        doc.add(Field.Keyword("path_field", node.getPath()));
        writer.addDocument(doc);
    }

    /**
     * Indexes the description of the given node and then, recursively, the
     * descriptions of every node in its subtree. {@code null} entries in a
     * child list are skipped.
     *
     * @param node the root of the subtree to index
     * @param writer the open index writer that receives the documents
     * @param analyzer the analyzer passed on to {@code indexDesc}
     * @throws IOException propagated from {@code indexDesc}
     */
    public void indexDescForAll(TNode node, IndexWriter writer,
        Analyzer analyzer) throws IOException {
        indexDesc(node, writer, analyzer);

        // A node without a child list ends the recursion.
        if (node.children == null) {
            return;
        }

        for (int idx = 0; idx < node.children.size(); idx++) {
            TNode descendant = (TNode) node.children.get(idx);

            if (descendant == null) {
                continue;
            }

            indexDescForAll(descendant, writer, analyzer);
        }
    }

    /**
     * Computes the description similarity of each target node with each
     * source node and stores the results in {@code descSimTable}.
     *
     * <p>A scratch index directory is opened at
     * {@code SCIA.SCIA_WORK_DIR + "/individualIndex"} and handed to
     * {@code computeDescSimForAll}. Nothing is computed when either tree has
     * no root node.
     *
     * @param target the target schema tree
     * @param source the source schema tree
     * @param descSimTable the table that receives the computed similarities
     * @throws IOException if the scratch index cannot be created or used
     */
    public void computeTreesDescSim(SchemaTree target, SchemaTree source,
        Hashtable descSimTable) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();

        String path = SCIA.SCIA_WORK_DIR + "/individualIndex";
        Directory dir = FSDirectory.getDirectory(path, true);

        try {
            if ((target.rootNode != null) && (source.rootNode != null)) {
                computeDescSimForAll(target.rootNode, source.rootNode, analyzer,
                    descSimTable, dir);
            }
        } finally {
            // The directory was obtained here, so it is released here too.
            dir.close();
        }
    }

    /**
     * Computes the description similarity of the target node {@code tNode}
     * with the source node {@code sNode} and all of {@code sNode}'s subtree
     * nodes, then repeats the same for every node in {@code tNode}'s subtree.
     * All results are stored in {@code descSimTable}.
     *
     * @param tNode the root of the target subtree
     * @param sNode the root of the source subtree
     * @param analyzer the analyzer used for analyzing text
     * @param descSimTable the table that receives the computed similarities
     * @param dir the directory of the individual (scratch) index
     * @throws IOException propagated from {@code computeDescSim}
     */
    public void computeDescSimForAll(TNode tNode, TNode sNode,
        Analyzer analyzer, Hashtable descSimTable, Directory dir)
        throws IOException {
        // This target node against the whole source subtree.
        computeDescSim(tNode, sNode, analyzer, descSimTable, dir);

        if (tNode.children == null) {
            return;
        }

        // Every target child, recursively, against the whole source subtree.
        for (int idx = 0; idx < tNode.children.size(); idx++) {
            TNode targetChild = (TNode) tNode.children.get(idx);

            if (targetChild == null) {
                continue;
            }

            computeDescSimForAll(targetChild, sNode, analyzer, descSimTable,
                dir);
        }
    }

    /**
     * Computes the description similarity of {@code tNode} with {@code sNode}
     * and, recursively, with each of {@code sNode}'s subtree nodes.
     *
     * <p>For a pair whose refined descriptions are both non-empty, the two
     * texts are written as fields {@code content1} (source) and
     * {@code content2} (target) of one document in a freshly created index
     * in {@code dir}. The cosine of their term-frequency vectors is then
     * stored in {@code descSimTable} under the key
     * {@code "(" + targetPath + "," + sourcePath + ")"} as a {@code Double};
     * a NaN cosine is stored as 0. Pairs with a missing or empty description
     * add no entry.
     *
     * @param tNode the target schema tree node
     * @param sNode the source schema tree node
     * @param analyzer the analyzer for analyzing text
     * @param descSimTable the table that receives the computed similarities
     * @param dir the directory for the individual index
     * @throws IOException if the scratch index cannot be written or read
     */
    public void computeDescSim(TNode tNode, TNode sNode, Analyzer analyzer,
        Hashtable descSimTable, Directory dir) throws IOException {
        if ((sNode == null) || (tNode == null)) {
            return;
        }

        String str_source = null;

        if (sNode.getDescription() != null) {
            str_source = descriptionRefine(sNode.getDescription(), analyzer,
                    "stemming_field");
        }

        String str_target = null;

        if (tNode.getDescription() != null) {
            str_target = descriptionRefine(tNode.getDescription(), analyzer,
                    "stemming_field");
        }

        String target_path = tNode.getPath();
        String source_path = sNode.getPath();

        if ((str_source != null) && !str_source.equals("") &&
                (str_target != null) && !str_target.equals("")) {
            // Re-create the scratch index holding just this one pair.
            IndexWriter indxWrt = new IndexWriter(dir, analyzer, true);

            try {
                Document doc1 = new Document();
                doc1.add(Field.Text("content1", str_source, true));
                doc1.add(Field.Text("content2", str_target, true));
                indxWrt.addDocument(doc1);
            } finally {
                indxWrt.close();
            }

            double cosine;
            IndexReader reader = IndexReader.open(dir);

            try {
                TermFreqVector v1 = reader.getTermFreqVector(0, "content1");
                TermFreqVector v2 = reader.getTermFreqVector(0, "content2");

                cosine = CompareDescVector.computeVectorCosine(v1, v2);
            } finally {
                reader.close();
            }

            if (Double.isNaN(cosine)) {
                cosine = 0.0d;
            }

            /* to combine with other techniques, phrase query, fuzzy query, etc.,
               indexSchemaTreeDesc(sourceSchemaTree) would be needed in order to
               query over it. Sample code is in the test method where PhraseQuery
               is used
            */

            //put the result, right now only the cosine itself, into descSimTable
            String key = "(" + target_path + "," + source_path + ")";
            descSimTable.put(key, new Double(cosine));
        }

        // Same target node against every child of the source node.
        if ((sNode.children != null) && !sNode.children.isEmpty()) {
            for (int i = 0; i < sNode.children.size(); i++) {
                computeDescSim(tNode, (TNode) sNode.children.get(i), analyzer,
                    descSimTable, dir);
            }
        }
    }

    /**
     * Manual experiment that combines two description-similarity measures for
     * one hard-coded pair of texts: the score of a phrase query built from
     * the source text and run against an index of the target text, and the
     * cosine of the term-frequency vectors of both texts. The intermediate
     * values and the combined result {@code 0.5 * score + 0.5 * cosine} are
     * printed to standard error when {@code SCIA.debug_on} is set.
     *
     * <p>The indexes are written to the fixed locations
     * {@code f:/scia/index} and {@code f:/scia/individualIndex}.
     *
     * @throws Exception if indexing, analysing or searching fails
     */
    public void test() throws Exception {
        Analyzer analyzer = new StandardAnalyzer();

        // indexing the target tree
        Directory target_tree_dir = FSDirectory.getDirectory("f:/scia/index",
                true);
        IndexWriter writer = new IndexWriter(target_tree_dir, analyzer, true);
        String target_path = "rami";
        String str_target;

        try {
            String target = "In addition,  contributions to graph theory and to the principles of phylogeny";
            String pathFromRoot = "rami";

            str_target = descriptionRefine(target, analyzer, "stemmingField");

            if (SCIA.debug_on) {
                System.err.println("str_target = " + str_target);
            }

            if (!str_target.equals("")) {
                Document doc = new Document();
                doc.add(Field.Text("contentField", str_target));
                doc.add(Field.Keyword("pathField", pathFromRoot));
                writer.addDocument(doc);
            }
        } finally {
            writer.close();
        }

        String source = "In addition,  contributions to graph theory and to the principles of phylogeny";
        String str_source = descriptionRefine(source, analyzer, "stemmingField");

        double cosine = 0.0d;

        Directory dir = FSDirectory.getDirectory("f:/scia/individualIndex", true);
        IndexWriter indxWrt = new IndexWriter(dir, analyzer, true);
        boolean bothPresent = !str_source.equals("") && !str_target.equals("");

        try {
            if (bothPresent) {
                Document doc1 = new Document();
                doc1.add(Field.Text("content1", str_source, true));
                doc1.add(Field.Text("content2", str_target, true));

                indxWrt.addDocument(doc1);
            }
        } finally {
            // Closed on every path; previously leaked when a text was empty.
            indxWrt.close();
        }

        if (bothPresent) {
            IndexReader reader = IndexReader.open(dir);

            try {
                TermFreqVector v1 = reader.getTermFreqVector(0, "content1");
                TermFreqVector v2 = reader.getTermFreqVector(0, "content2");

                cosine = CompareDescVector.computeVectorCosine(v1, v2);
            } finally {
                reader.close();
            }
        }

        Term term = new Term("contentField", str_source);

        if (SCIA.debug_on) {
            System.err.println("term = " + term);
        }

        // Build a phrase query from the tokens of the refined source text.
        PhraseQuery pq = new PhraseQuery();
        String query = "";
        TokenStream stream = analyzer.tokenStream("contentField",
                new StringReader(str_source));

        try {
            for (Token token = stream.next(); token != null;
                    token = stream.next()) {
                query = (query + " " + token.termText());

                if (SCIA.debug_on) {
                    System.err.print("[" + token.termText() + "] ");
                }

                pq.add(new Term("contentField", token.termText()));
            }
        } finally {
            stream.close();
        }

        if (SCIA.debug_on) {
            System.err.println("\nphrase query = " + pq);
        }

        if (SCIA.debug_on) {
            System.err.println("query = " + query);
        }

        // A slop equal to the token count was also tried; it made no difference.
        pq.setSlop(0);

        double score = 0.0d;
        Searcher searcher = new IndexSearcher(target_tree_dir);

        try {
            Hits hits = searcher.search(pq);

            // Valid hit positions are 0 .. length() - 1.
            for (int x = 0; x < hits.length(); x++) {
                Document doc = hits.doc(x);
                String path_name = doc.get("pathField");

                if (SCIA.debug_on) {
                    System.err.println("path_name = " + path_name);
                }

                if (target_path.equals(path_name)) {
                    score = hits.score(x);

                    if (SCIA.debug_on) {
                        System.err.println("hitted, score = " + score);
                    }
                }
            }
        } finally {
            searcher.close();
        }

        if (SCIA.debug_on) {
            System.err.println("score = " + score);
        }

        if (Double.isNaN(cosine)) {
            cosine = 0.0d;
        }

        if (SCIA.debug_on) {
            System.err.println("cosine = " + cosine);
        }

        double lpSim = LetterPairSimilarity.compareStrings(str_source,
                str_target);

        if (SCIA.debug_on) {
            System.err.println("lpsim = " + lpSim);
        }

        double desc_sim = (0.5 * score) + (0.5 * cosine);

        // Display the result of description similarity
        if (SCIA.debug_on) {
            System.err.println("description similarity = " + desc_sim);
        }
    }

    /**
     * Command-line entry point: imports two fixed XML schemas located under
     * the directory named by the system property {@code env.SCIA_WORK_DIR}
     * and runs the description matcher on the resulting schema trees.
     *
     * @param args ignored
     * @throws IllegalStateException if {@code env.SCIA_WORK_DIR} is not set
     * @throws Exception if importing a schema or matching fails
     */
    public static void main(String[] args) throws Exception {
        String workDir = System.getProperty("env.SCIA_WORK_DIR");

        // Without this check the schema paths would silently start with "null".
        if (workDir == null) {
            throw new IllegalStateException(
                "System property env.SCIA_WORK_DIR is not set");
        }

        String sourceSchema = workDir +
            "/schemas/schema-PLT-GCED-0409.1.1.xsd";
        String targetSchema = workDir + "/schemas/LTER-schema.xsd";

        SchemaTree sourceTree = new SchemaTree((new XMLSchemaImporter(
                    sourceSchema)).tree);

        SchemaTree targetTree = new SchemaTree((new XMLSchemaImporter(
                    targetSchema)).tree);
        Hashtable descSimTable = new Hashtable();

        // The constructor does all the work and fills descSimTable.
        new DescriptionMatcher(targetTree, sourceTree, descSimTable);
    }
}
