Add a test

Plus, more plural fixes
author: bjculkin <bjculkin@mix.wvu.edu> 2017-04-04 18:51:52 -0400
committer: bjculkin <bjculkin@mix.wvu.edu> 2017-04-04 18:51:52 -0400
commit: 7b56beefe4df24acd8437bf42262e7cd30d43970 (patch)
tree: 0f8a4e98897ba9b42fc705ca39a21eed44943941 /src/examples
parent: e4aa727c5ae37ef4d8df45ef535719f626b10917 (diff)
1 files changed, 176 insertions, 2 deletions
diff --git a/src/examples/java/bjc/inflexion/InflexionTester.java b/src/examples/java/bjc/inflexion/InflexionTester.java
index 5f95de7..a11d168 100644
--- a/src/examples/java/bjc/inflexion/InflexionTester.java
+++ b/src/examples/java/bjc/inflexion/InflexionTester.java
@@ -19,7 +19,19 @@ import bjc.inflexion.v2.Noun;
 import bjc.inflexion.v2.Nouns;
 import bjc.inflexion.v2.Prepositions;
 
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 
 /**
  * Test inflecting words.
@@ -46,7 +58,9 @@ public class InflexionTester {
 
 		Scanner scn = new Scanner(System.in);
 
-		System.out.print("Enter a noun to inflect (blank line to quit): ");
+		wikitest(scn, nounDB);
+
+		/*System.out.print("Enter a noun to inflect (blank line to quit): ");
 		String ln = scn.nextLine().trim();
 
 		while(!ln.equals("")) {
@@ -63,8 +77,168 @@ public class InflexionTester {
 
 			System.out.print("Enter a noun to inflect (blank line to quit): ");
 			ln = scn.nextLine().trim();
-		}
+		}*/
 
 		scn.close();
 	}
+
+	@SuppressWarnings("unused")
+	private static void wikitest(Scanner scn, Nouns nounDB) { // Reads a bzip2 dump named on scn and scores nounDB plurals against its en-noun templates
+		System.out.print("Enter name of dump file: ");
+
+		String fname = scn.nextLine().trim();
+
+		try(InputStream compressedStream = new FileInputStream(fname)) {
+			InputStream stream = new BZip2CompressorInputStream(compressedStream);
+			BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
+
+			/*
+			 * Pattern to find the page title, which is used as the word under test
+			 */
+			Pattern titlePattern = Pattern.compile("<title>([^<]+)</title>");
+			/*
+			 * Pattern to find the beginning of a page's wiki text (an opening <text tag)
+			 */
+			Pattern textPattern = Pattern.compile("<text");
+			/*
+			 * Pattern to find a {{rank template; lines containing one mark the page as a basic word
+			 */
+			Pattern rankPattern = Pattern.compile("\\{\\{rank");
+			/*
+			 * Pattern to find an {{en-noun...}} template; group 1 holds its |-separated arguments
+			 */
+			Pattern enNounPattern = Pattern.compile("\\{\\{en-noun([a-z0-9\\|\\-\\[\\]\\?\\!=]*)\\}\\}");
+
+			Pattern wordPattern = Pattern.compile("([a-zA-Z\\-]+)"); // accepts an argument only if it is purely letters and hyphens
+
+			String line;
+			String word = "";
+			int text = 0;
+			int count = 0;
+			int basicCount = 0;
+			int wrong = 0;
+			int basicWrong = 0;
+			int wrongNoPlural = 0;
+			int wrongUncountable = 0;
+			boolean basicWord = false;
+			while((line = reader.readLine()) != null) {
+				Matcher titleMatcher = titlePattern.matcher(line);
+				if(titleMatcher.find()) {
+					word = titleMatcher.group(1);
+					if(word.startsWith("Wiktionary:")) {
+						continue; // NOTE(review): text and basicWord are not reset here, so the previous page's state carries over
+					}
+					basicWord = false;
+					text = 0;
+					continue;
+				}
+				Matcher textMatcher = textPattern.matcher(line);
+				if(textMatcher.find()) {
+					text++;
+					continue;
+				}
+				Matcher rankMatcher = rankPattern.matcher(line);
+				if(rankMatcher.find()) {
+					basicWord = true;
+					basicCount++; // NOTE(review): incremented for every {{rank line, whether or not the page yields an en-noun match
+				}
+				if(text != 1) {
+					continue;
+				}
+				Matcher enNounMatcher = enNounPattern.matcher(line);
+				if(enNounMatcher.find()) {
+					// Only the first en-noun match per page is scored: text++ below makes the text != 1 check skip the rest
+					/*
+					 * if (text != 1) { continue; }
+					 */
+					text++;
+					count++;
+					if(count % 5000 == 0) {
+						System.out.println(count);
+					}
+					String[] rules = enNounMatcher.group(1).split("\\|");
+					List<String> plurals = new ArrayList<>();
+
+					boolean uncountable = false;
+					boolean noPlural = false;
+					for(String rule : rules) {
+						if(rule.isEmpty()) {
+							continue;
+						}
+						if("-".equals(rule)) {
+							plurals.add(word);
+							uncountable = true;
+						} else if("s".equals(rule)) {
+							plurals.add(word + "s");
+						} else if("es".equals(rule)) {
+							plurals.add(word + "es");
+						} else if("!".equals(rule)) {
+							plurals.add("plural not attested");
+							uncountable = true;
+						} else if("?".equals(rule)) {
+							plurals.add("unknown");
+							noPlural = true;
+						} else {
+							Matcher matcher = wordPattern.matcher(rule);
+							if(matcher.matches()) {
+								plurals.add(rule);
+							}
+						}
+					}
+					if(plurals.isEmpty()) { // no usable arguments: expect the plain word + "s" form
+						plurals.add(word + "s");
+					}
+
+					String calculatedPlural = nounDB.getNoun(word).plural();
+					boolean ok = false;
+					for(String plural : plurals) {
+						if(plural.equals(calculatedPlural)) {
+							ok = true;
+							break;
+						}
+					}
+
+					if(!ok) {
+						wrong++;
+						if(uncountable) {
+							wrongUncountable++;
+						} else if(noPlural) {
+							wrongNoPlural++;
+						}
+						if(basicWord) {
+							System.out.println("basic word: " + word + " got: "
+									+ calculatedPlural + ", but expected "
+									+ enNounMatcher.group(1));
+							basicWrong++;
+						} else {
+							System.out.println(word + " got: " + calculatedPlural
+									+ ", but expected " + enNounMatcher.group(1));
+						}
+					}
+				}
+			}
+			reader.close();
+			compressedStream.close(); // also closed by the try-with-resources header above
+
+			float correct = (count - wrong) * 100 / (float) count;
+			float basicCorrect = (basicCount - basicWrong) * 100 / (float) basicCount;
+			float wrongUncountablePercent = wrongUncountable * 100 / (float) count;
+			float wrongNoPluralPercent = wrongNoPlural * 100 / (float) count;
+			int justPlainWrong = wrong - wrongUncountable - wrongNoPlural; // mismatches not attributed to an uncountable or unknown-plural marker
+			float justPlainWrongPercent = justPlainWrong * 100 / (float) count;
+			System.out.println("Words checked: " + count + " (" + basicCount + " basic words)");
+			System.out.println("Correct: " + correct + "% (" + basicCorrect + "% basic words)");
+			System.out.println("Errors: ");
+			System.out.println(
+					"    Uncountable: " + wrongUncountable + " (" + wrongUncountablePercent + "%)");
+			System.out.println("    No plural form specified: " + wrongNoPlural + " ("
+					+ wrongNoPluralPercent + "%)");
+			System.out.println("    Incorrect answer: " + justPlainWrong + " (" + justPlainWrongPercent
+					+ "%)");
+		} catch(FileNotFoundException fnfex) {
+			fnfex.printStackTrace();
+		} catch(IOException ioex) {
+			ioex.printStackTrace();
+		}
+	}
 }
 \ No newline at end of file
author	bjculkin <bjculkin@mix.wvu.edu>	2017-04-04 18:51:52 -0400
committer	bjculkin <bjculkin@mix.wvu.edu>	2017-04-04 18:51:52 -0400
commit	7b56beefe4df24acd8437bf42262e7cd30d43970 (patch)
tree	0f8a4e98897ba9b42fc705ca39a21eed44943941 /src/examples
parent	e4aa727c5ae37ef4d8df45ef535719f626b10917 (diff)