ATN.java

/**
* Title: ATN<p>
 * Description: Implements a simple ATN parser that uses WordNet data
* Copyright: Copyright (c) by Mark Watson, 2000<p>
* @author Mark Watson
* @version 1.2
*/
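// Note: the serialized lexicon wncache.dat must be generated first by
// running MakeWordNetCache (listed below) against the WordNet 1.6 index
// files, and must be on the classpath (or in the JAR) when ATN runs.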
import java.io.*;
import java.util.*;
public class ATN {
public ATN() {
try {
// the following code will read either a local file or a
// resource in a JAR file:
InputStream ins =
ClassLoader.getSystemResourceAsStream("wncache.dat");
if (ins==null) {
System.out.println("Failed to open 'wncache.dat'");
System.exit(1);
} else {
ObjectInputStream p = new ObjectInputStream(ins);
adj = (Hashtable)p.readObject();
adv = (Hashtable)p.readObject();
noun = (Hashtable)p.readObject();
verb = (Hashtable)p.readObject();
ins.close();
}
// Augment the WordNet 1.6 entries:
art = new Hashtable(); addWords(art, ARTS);
conj = new Hashtable(); addWords(conj, CONJS);
det = new Hashtable(); addWords(det, DETS);
pron = new Hashtable(); addWords(pron, PRONS);
prep = new Hashtable(); addWords(prep, PREPS);
// fill in a few common verbs that are not in WordNet 1.6:
verb.put("ran", b);
} catch (Exception e) {
e.printStackTrace();
}
}
private Boolean b = new Boolean(true);
private void addWords(Hashtable h, String [] ws) {
for (int i=0; i<ws.length; i++) {
h.put(ws[i], b);
}
}
Hashtable adj, adv, art, conj, det, noun, pron, verb, prep;
// (entries must be lower case, since parse() lower-cases all input tokens)
String [] PRONS = {"he", "she", "me", "it", "you", "i"};
String [] ARTS = {"the", "a", "an"};
String [] CONJS = {"and", "or"};
String [] DETS = {"who", "what", "where", "when"};
String [] PREPS = {"on", "at", "under", "above", "behind", "to", "about","down"};
private boolean checkWord(String word, int type) {
if (type == PREP) {
if (prep.get(word) != null) return true;
} else if (type == VERB) {
if (verb.get(word) != null) return true;
// some simple kluges to accept words like "likes" when
// only "like" is in the lexicon:
if (word.endsWith("s") || word.endsWith("ed")) {
String s = word.substring(0, word.length() - 1);
if (verb.get(s) != null) return true;
// also try stripping the full "ed" ending (e.g. "wanted" --> "want"):
if (word.endsWith("ed")) {
s = word.substring(0, word.length() - 2);
if (verb.get(s) != null) return true;
}
}
} else if (type == NOUN) {
if (noun.get(word) != null) return true;
} else if (type == CONJ) {
if (conj.get(word) != null) return true;
} else if (type == ADJ) {
if (adj.get(word) != null) return true;
} else if (type == ADV) {
if (adv.get(word) != null) return true;
} else if (type == PRON) {
if (pron.get(word) != null) return true;
} else if (type == DET) {
if (det.get(word) != null) return true;
} else if (type == ART) {
if (art.get(word) != null) return true;
}
return false;
}
public int [] parse(String s) {
Vector v = new Vector();
StringTokenizer st = new StringTokenizer(s);
while (st.hasMoreTokens()) {
String str = st.nextToken();
// strip at most one trailing ',', '.', ':' and ';' from the token
// (checked in that fixed order):
String [] punct = {",", ".", ":", ";"};
for (int p=0; p<punct.length; p++) {
if (str.length() > 2 && str.endsWith(punct[p])) {
str = str.substring(0, str.length() - 1);
}
}
v.addElement(str.toLowerCase());
}
// It is easier to work with an array, so convert the Vector
// to an array of Java strings:
int size = v.size();
if (size == 0) return null;
words = new String[size];
partsOfSpeech = new int[size];
num_words = size;
for (int i=0; i<size; i++) words[i] = (String)v.elementAt(i);
// quick test against lexicon for word types:
for (int i=0; i<words.length; i++) {
System.out.print("'" + words[i] + "' possible word types: ");
if (adj.get(words[i]) != null) System.out.print("adj ");
if (adv.get(words[i]) != null) System.out.print("adv ");
if (art.get(words[i]) != null) System.out.print("art ");
if (noun.get(words[i]) != null) System.out.print("noun ");
if (prep.get(words[i]) != null) System.out.print("prep ");
if (verb.get(words[i]) != null) System.out.print("verb ");
System.out.println();
}
System.out.println();
// try every sentence pattern and keep the best-scoring parse:
parse_it();
return partsOfSpeech;
}
String [] words;
int [] partsOfSpeech;
int wordIndex;
int num_words;
public static void main(String [] args) {
ATN nf = new ATN();
if (args.length < 1) {
nf.parse("the dog ran down the street");
} else {
for (int i=0; i<args.length; i++) {
System.out.println("\nProcessing : " + args[i]);
nf.parse(args[i]);
}
}
}
//////////////// ATN functions:
String getPOSname(int pos) {
switch (pos) {
case NP: return "NP";
case VP: return "VP";
case PP: return "PP";
case NOUN: return "noun";
case VERB: return "verb";
case PREP: return "prep";
case CONJ: return "conj";
case ADJ: return "adj";
case ADV: return "adv";
case PRON: return "pron";
case DET: return "det";
case ART: return "art";
default: return "unknown";
}
}
public final static int NP = 1;
public final static int VP = 2;
public final static int PP = 3;
public final static int NOUN = 1001;
public final static int VERB = 1002;
public final static int PREP = 1003;
public final static int CONJ = 1004;
public final static int ADJ = 1005;
public final static int ADV = 1006;
public final static int PRON = 1007;
public final static int DET = 1008;
public final static int ART = 1009;
public final static int NUM_S = 9;
// int [] LEN_S = { 5, 4, 3, 4, 3, 2, 2, 2, 1};
int [][] ALL_S = {
{NP, VP, NP, PP, VP},
{NP, VP, PP, NP},
{NP, VP, NP},
{VP, NP, PP, NP},
{VP, PP, NP},
{NP, VP},
{VP, PP},
{VP, NP},
{VP}
};
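// Informal example: for the sentence "the dog ran down the street",
// the pattern {NP, VP} can consume every word ("the dog" as the NP,
// the remaining words through the verb transitions), so it scores
// highest in parseSentence() and is re-run to record the tags.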
//////////////// The actual parser:
int parsePP(int start_word_index, int word_index) {
if (word_index >= num_words) return word_index;
// test ATN transitions <PREP> --> <NP>
if (checkWord(words[word_index], PREP)) {
partsOfSpeech[start_word_index + word_index] = PREP;
int ii = parseNP(start_word_index, word_index + 1);
if (ii > -1) {
return ii;
}
}
return -1;
}
int parseNP(int start_word_index, int word_index) {
if (word_index >= num_words) return word_index;
// test ATN transitions <NOUN> --> <CONJ> --> <NP>
if (word_index < num_words - 2 && checkWord(words[word_index], NOUN)) {
if (checkWord(words[word_index + 1], CONJ)) {
int ii = parseNP(start_word_index, word_index + 2);
if (ii > -1) {
partsOfSpeech[start_word_index + word_index] = NOUN;
partsOfSpeech[start_word_index + word_index + 1] = CONJ;
return ii;
}
}
}
// test ATN transitions <ART> --> <NP>
if (word_index < num_words - 1 && checkWord(words[word_index], ART)) {
int ii = parseNP(start_word_index, word_index + 1);
if (ii > -1) {
partsOfSpeech[start_word_index + word_index] = ART;
return ii;
}
}
// test ATN transitions <DET> --> <NP>
if (word_index < num_words - 1 && checkWord(words[word_index], DET)) {
int ii = parseNP(start_word_index, word_index + 1);
if (ii > -1) {
partsOfSpeech[start_word_index + word_index] = DET;
return ii;
}
}
// test ATN transitions <ADJ> --> <NP>
if (checkWord(words[word_index], ADJ)) {
int ii = parseNP(start_word_index, word_index + 1);
if (ii > -1) {
partsOfSpeech[start_word_index + word_index] = ADJ;
return ii;
}
}
// test ATN transitions <ADV> --> <NP>
if (word_index < num_words - 1 && checkWord(words[word_index], ADV)) {
int ii = parseNP(start_word_index, word_index + 1);
if (ii > -1) {
partsOfSpeech[start_word_index + word_index] = ADV;
return ii;
}
}
// test ATN transitions <NOUN> --> <NOUN>
if (word_index < num_words - 1 && checkWord(words[word_index], NOUN)) {
if (checkWord(words[word_index + 1], NOUN)) {
partsOfSpeech[start_word_index + word_index] = NOUN;
partsOfSpeech[start_word_index + word_index + 1] = NOUN;
return word_index + 2;
}
}
if (checkWord(words[word_index], NOUN)) {
partsOfSpeech[start_word_index + word_index] = NOUN;
return word_index + 1;
}
if (checkWord(words[word_index], PRON)) {
int ii = parseNP(start_word_index, word_index + 1);
if (ii > -1) {
partsOfSpeech[start_word_index + word_index] = PRON;
return ii;
}
}
if (checkWord(words[word_index], PRON)) {
partsOfSpeech[start_word_index + word_index] = PRON;
return word_index + 1;
}
return -1;
}
int parseVP(int start_word_index, int word_index) {
if (word_index >= num_words) return word_index;
// test ATN transitions <V> --> <NP> --> <PP>
if (checkWord(words[word_index], VERB)) {
partsOfSpeech[start_word_index + word_index] = VERB;
int ii = parseNP(start_word_index, word_index + 1);
if (ii > -1) {
int jj = parsePP(start_word_index, ii);
if (jj > -1) {
return jj;
}
}
}
// test ATN transitions <V> --> <NP>
if (checkWord(words[word_index], VERB)) {
partsOfSpeech[start_word_index + word_index] = VERB;
int ii = parseNP(start_word_index, word_index + 1);
if (ii > -1) {
return ii;
}
}
// test ATN transitions <V> --> <PP>
if (checkWord(words[word_index], VERB)) {
partsOfSpeech[start_word_index + word_index] = VERB;
int ii = parsePP(start_word_index, word_index + 1);
if (ii > -1) {
return ii;
}
}
if (checkWord(words[word_index], VERB)) {
partsOfSpeech[start_word_index + word_index] = VERB;
return word_index + 1;
}
return -1;
}
int parseHelper(int [] atn, int start_word_index) {
int word_index = 0;
int len_atn = atn.length;
int last_word_index = word_index;
for (int i=0; i<len_atn; i++) {
last_word_index = word_index;
switch (atn[i]) {
case NP: word_index = parseNP(start_word_index, word_index); break;
case VP: word_index = parseVP(start_word_index, word_index); break;
case PP: word_index = parsePP(start_word_index, word_index); break;
}
if (word_index == -1) return last_word_index;
}
return word_index;
}
int parseSentence(int start_word_index) {
int max_val = -1;
int best_pattern = 0;
for (int i=0; i<NUM_S; i++) {
int k = parseHelper(ALL_S[i], start_word_index);
//System.out.println("Score for ATN " + i + " is " + k);
if (k > max_val) {
max_val = k;
best_pattern = i;
}
}
System.out.println("Best ATN is sentence pattern " + best_pattern);
parseHelper(ALL_S[best_pattern], start_word_index);
for (int i=0; i<num_words; i++) {
if (partsOfSpeech[start_word_index + i] == 0) {
if (checkWord(words[i], NOUN)) {
partsOfSpeech[start_word_index + i] = NOUN;
}
if (checkWord(words[i], CONJ)) {
partsOfSpeech[start_word_index + i] = CONJ;
}
}
}
return max_val;
}
void parse_it() {
int word_index = parseSentence(0);
//System.out.println("word_index from S ATN = " + word_index);
for (int i=0; i<num_words; i++) {
System.out.println(" word: " + words[i] +
" part of speech: " + getPOSname(partsOfSpeech[i]));
}
}
}
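
For reference, a minimal sketch of driving the parser from code. ATNDemo is an illustrative name, not part of the original sources; it assumes wncache.dat is on the classpath and that parse() returns the partsOfSpeech array as shown above:

public class ATNDemo {
public static void main(String [] args) {
ATN atn = new ATN();
// parse() prints its own analysis and returns one part-of-speech
// constant (ATN.NOUN, ATN.VERB, ...) per input word:
int [] tags = atn.parse("the dog ran down the street");
if (tags != null) {
for (int i=0; i<tags.length; i++) {
System.out.println(atn.getPOSname(tags[i]));
}
}
}
}

Compiled together with ATN in the default package, this prints the per-word analysis followed by one tag name per word.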
MakeWordNetCache.java

/**
* Title: MakeWordNetCache<p>
 * Description: Reads WordNet 1.6 index files and builds a serialized
 *              part-of-speech file consisting of 4 Java hash tables
 *              (for adj, adv, noun, and verb).
* Copyright: Copyright (c) by Mark Watson, 2000<p>
* @author Mark Watson
* @version 1.1
*/
/** WordNet 1.6 Copyright and License:

This software and database is being provided to you, the LICENSEE, by
Princeton University under the following license. By obtaining, using
and/or copying this software and database, you agree that you have
read, understood, and will comply with these terms and conditions.:

Permission to use, copy, modify and distribute this software and
database and its documentation for any purpose and without fee or
royalty is hereby granted, provided that you agree to comply with
the following copyright notice and statements, including the disclaimer,
and that the same appear on ALL copies of the software, database and
documentation, including modifications that you make for internal
use or for distribution.

WordNet 1.6 Copyright 1997 by Princeton University. All rights reserved.

THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
OTHER RIGHTS.

The name of Princeton University or Princeton may not be used in
advertising or publicity pertaining to distribution of the software
and/or database. Title to copyright in this software, database and
any associated documentation shall at all times remain with
Princeton University and LICENSEE agrees to preserve same.
*/
import java.io.*;
import java.util.*;
public class MakeWordNetCache {
Hashtable adj = new Hashtable();
Hashtable adv = new Hashtable();
Hashtable noun = new Hashtable();
Hashtable verb = new Hashtable();
public MakeWordNetCache() {
helper("index.adj", adj);
helper("index.adv", adv);
helper("index.noun", noun);
helper("index.verb", verb);
//System.out.println(verb.get("run"));
try {
FileOutputStream ostream = new FileOutputStream("wncache.dat");
ObjectOutputStream p = new ObjectOutputStream(ostream);
p.writeObject(adj);
p.writeObject(adv);
p.writeObject(noun);
p.writeObject(verb);
p.flush();
ostream.close();
} catch (Exception e) {
e.printStackTrace();
}
}
Boolean t = new Boolean(true);
public void helper(String file, Hashtable hash) {
int count = 0;
try {
FileReader fr = new FileReader(file);
BufferedReader br = new BufferedReader(fr);
// skip copyright notice:
for (int i=0; i<30; i++) br.readLine();
while (true) {
String line = br.readLine();
if (line == null) break;
line = line.trim();
int index1 = line.indexOf(" ");
if (index1 == -1) continue;
line = line.substring(0, index1);
// skip entries containing '.' (e.g. abbreviations) or '_'
// (multi-word collocations in the WordNet index files):
int index2 = line.indexOf(".");
if (index2 != -1) continue;
index2 = line.indexOf("_");
if (index2 != -1) continue;
line = line.toLowerCase();
Object o = hash.get(line);
if (o == null) {
hash.put(line, t);
//System.out.println(file + " : " + line);
count++;
}
}
System.out.println("" + count + " words added for " + file);
} catch (Exception e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
new MakeWordNetCache();
}
}
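
To sanity-check the generated wncache.dat, one can read the four hash tables back in the same order MakeWordNetCache wrote them. CheckWordNetCache is a hypothetical helper sketched here for illustration, not part of the original sources:

import java.io.*;
import java.util.*;
public class CheckWordNetCache {
public static void main(String [] args) throws Exception {
ObjectInputStream in =
new ObjectInputStream(new FileInputStream("wncache.dat"));
// read back in write order: adj, adv, noun, verb
String [] names = {"adj", "adv", "noun", "verb"};
for (int i=0; i<names.length; i++) {
Hashtable h = (Hashtable)in.readObject();
System.out.println(names[i] + ": " + h.size() + " words");
}
in.close();
}
}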