/**
 * Copyright (c) 2005 The Regents of the University of California.
 * All rights reserved.
 *
 * Permission is hereby granted, without written agreement and without
 * license or royalty fees, to use, copy, modify, and distribute this
 * software and its documentation for any purpose, provided that the
 * above copyright notice and the following two paragraphs appear in
 * all copies of this software.
 *
 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
 * IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 *
 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
 * OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT,
 * UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 */
package org.kepler.scia;

import com.interdataworking.mm.alg.StringMatcher;

import java.util.ArrayList;


/**
 * LetterPairSimilarity measures how many adjacent character pairs two
 * strings have in common. It is intended to address weaknesses of existing
 * algorithms such as the Soundex Algorithm, Edit Distance, and Longest
 * Common Substring. Because each character pair carries a little
 * information about the original ordering, comparing adjacent pairs takes
 * into account not only the characters themselves but also their order in
 * the original string.
 *
 * @author Simon White
 */
public class LetterPairSimilarity {
    /**
     * Computes the lexical similarity of two strings from their adjacent
     * letter pairs.
     *
     * Both strings are upper-cased, split into words on white space, and
     * reduced to the list of adjacent character pairs of each word. The
     * result is twice the number of pairs common to both lists divided by
     * the total number of pairs. Whenever a match is found, the matched pair
     * is removed from the second list so that it cannot be matched again
     * (otherwise "GGGG" would score a perfect match against "GG").
     *
     * If neither string yields any pair (for example, both consist only of
     * single-character words), the ratio is undefined; in that case the
     * method returns 1.0 when the strings are equal ignoring case and 0.0
     * otherwise, so the result is never NaN.
     *
     * @param str1 the first string; must not be null
     * @param str2 the second string; must not be null
     * @return lexical similarity value in the range [0,1]
     * @throws NullPointerException if either argument is null
     */
    public static double compareStrings(String str1, String str2) {
        if ((str1 == null) || (str2 == null)) {
            throw new NullPointerException(
                "compareStrings requires two non-null strings");
        }

        String upper1 = str1.toUpperCase();
        String upper2 = str2.toUpperCase();
        ArrayList pairs1 = wordLetterPairs(upper1);
        ArrayList pairs2 = wordLetterPairs(upper2);
        int union = pairs1.size() + pairs2.size();

        if (union == 0) {
            // No letter pairs on either side: avoid 0/0 and fall back to
            // plain case-insensitive equality.
            return upper1.equals(upper2) ? 1.0 : 0.0;
        }

        int intersection = 0;

        for (int i = 0; i < pairs1.size(); i++) {
            // remove(Object) deletes only the first equal pair, so each
            // pair of the second string is matched at most once.
            if (pairs2.remove(pairs1.get(i))) {
                intersection++;
            }
        }

        return (2.0 * intersection) / union;
    }

    /**
     * Collects the adjacent letter pairs of every word in the input.
     *
     * The string is split into words on runs of white space and the pairs of
     * each word are appended, in order, to one list. A list is used rather
     * than an array because the number of pairs depends on how much white
     * space the input contains and is not known in advance.
     *
     * @param str the string to tokenize; must not be null
     * @return an ArrayList of 2-character Strings, possibly empty
     */
    private static ArrayList wordLetterPairs(String str) {
        ArrayList allPairs = new ArrayList();

        // Tokenize on white space; leading white space can still produce an
        // empty first token, which letterPairs() maps to an empty array.
        String[] words = str.split("\\s+");

        for (int w = 0; w < words.length; w++) {
            String[] pairsInWord = letterPairs(words[w]);

            for (int p = 0; p < pairsInWord.length; p++) {
                allPairs.add(pairsInWord[p]);
            }
        }

        return allPairs;
    }

    /**
     * Extracts the adjacent letter pairs of a single word.
     *
     * A word of length n yields n - 1 pairs; words shorter than two
     * characters (including the empty string) yield an empty array, never
     * null.
     *
     * @param str the word to split into pairs; must not be null
     * @return an array of adjacent letter pairs contained in the input string
     */
    private static String[] letterPairs(String str) {
        int numPairs = Math.max(str.length() - 1, 0);
        String[] pairs = new String[numPairs];

        for (int i = 0; i < numPairs; i++) {
            pairs[i] = str.substring(i, i + 2);
        }

        return pairs;
    }

    /**
     * Small demonstration: prints the letter-pair similarity of two sample
     * path-like tags.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String tag = "/eml/dataset/datatable/attributes/fhashdyua/shadfysa/coverage";
        String tag1 = "/itemRecord/coverage";

        System.out.println("letter pair sim = " + compareStrings(tag, tag1));
    }
}
