3 files changed, 245 insertions, 36 deletions
diff --git a/data/nouns.txt b/data/nouns.txt
index 4122dc8..c257bd6 100644
--- a/data/nouns.txt
+++ b/data/nouns.txt
@@ -986,6 +986,7 @@ ascidium         =>                 |  ascidia
 asylum           =>  asylums        |  asyla
 avicularium      =>                 |  avicularia
 axopodium        =>                 |  axopodia
+baculum          =>                 |  bacula
 bacterium        =>                 |  bacteria
 bifolium         =>                 |  bifolia
 caecum           =>                 |  caeca
@@ -1321,7 +1322,7 @@ alumnus         =>                  |  alumni
 alveolus        =>                  |  alveoli
 aptychus        =>                  |  aptychi
 aureus          =>                  |  aurei
-*bacillus       =>                 |  *bacilli
+*bacillus       =>                  |  *bacilli
 bronchus        =>                  |  bronchi
 bulimus         =>                  |  bulimi
 cactus          =>  cactuses        |  cacti
@@ -1961,6 +1962,7 @@ testudo        =>  testudos
 timpano        =>  timpanos
 tiro           =>  tiros
 tobacco        =>  tobaccos
+todo           =>  todos
 Togo           =>  Togos
 Tokyo          =>  Tokyos
 torero         =>  toreros
@@ -2736,9 +2738,17 @@ its     => theirs
 
 # Standard patterns of inflection for other nouns (in increasing order of generality)...
 
--[aeiou]o =>  -[aeiou]os |
--[aeo]lf  =>             |  -[aeo]lves
--[aiy]nx  =>  -[aiy]nxes |  -[aiy]nges
+-ao =>  -aos |
+-eo =>  -eos |
+-io =>  -ios |
+-oo =>  -oos |
+-uo =>  -uos |
+-alf  =>             |  -alves
+-elf  =>             |  -elves
+-olf  =>             |  -olves
+-anx  =>  -anxes |  -anges
+-inx  =>  -inxes |  -inges
+-ynx  =>  -ynxes |  -ynges
 -arf      =>             |  -arves
 -ceps     =>  -ceps      |
 -ch       =>  -ches      |
@@ -2750,9 +2760,14 @@ its     => theirs
 -oe       =>  -oes       |
 -o        =>  -oes       |
 -quy      =>  -quies     |
--[aeiou]y =>  -[aeiou]ys |
+-ay =>  -ays |
+-ey =>  -eys |
+-iy =>  -iys |
+-oy =>  -oys |
+-uy =>  -uys |
 -ss       =>  -sses      |
--[^s]sis  =>             |  -[^s]ses
+-sis  =>             |  -ses
+#-[^s]sis  =>             |  -[^s]ses
 -trix     =>  -trixes    |  -trices
 -us       =>  -uses      |
 -x        =>  -xes       |
diff --git a/pom.xml b/pom.xml
index b1364e4..d577778 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,29 +1,49 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-	<modelVersion>4.0.0</modelVersion>
-	<groupId>inflexion</groupId>
-	<artifactId>inflexion</artifactId>
-	<version>0.0.1-SNAPSHOT</version>
-	<name>Inflexion</name>
-	<description>Java based implementation of Damian Conway's pluralization algorithm.</description>
-
-	<build>
-		<plugins>
-			<plugin>
-				<artifactId>maven-compiler-plugin</artifactId>
-				<configuration>
-					<source>1.8</source>
-					<target>1.8</target>
-				</configuration>
-			</plugin>
-		</plugins>
-		<resources>
-			<resource>
-				<directory>data/</directory>
-				<includes>
-					<include>**/*.txt</include>
-				</includes>
-			</resource>
-		</resources>
-	</build>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	<groupId>bjc</groupId>
+	<artifactId>inflexion</artifactId>
+	<version>0.0.1-SNAPSHOT</version>
+	<name>Inflexion</name>
+	<description>Java-based implementation of Damian Conway's Lingua::EN::Inflexion module for Perl</description>
+
+	<licenses>
+		<license>
+			<name>Apache License, Version 2.0</name>
+			<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+			<distribution>repo</distribution>
+		</license>
+	</licenses>
+
+	<scm>
+		<connection>scm:git:https://github.com/bculkin2442/Inflexion.git</connection>
+		<url>https://github.com/bculkin2442/Inflexion</url>
+	</scm>
+
+	<build>
+		<plugins>
+			<plugin>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<configuration>
+					<source>1.8</source>
+					<target>1.8</target>
+				</configuration>
+			</plugin>
+		</plugins>
+		<resources>
+			<resource>
+				<directory>data/</directory>
+				<includes>
+					<include>**/*.txt</include>
+				</includes>
+			</resource>
+		</resources>
+	</build>
+	<dependencies>
+		<dependency>
+			<groupId>org.apache.commons</groupId>
+			<artifactId>commons-compress</artifactId>
+			<version>1.13</version>
+		</dependency>
+	</dependencies>
 </project>
 \ No newline at end of file
diff --git a/src/examples/java/bjc/inflexion/InflexionTester.java b/src/examples/java/bjc/inflexion/InflexionTester.java
index 5f95de7..a11d168 100644
--- a/src/examples/java/bjc/inflexion/InflexionTester.java
+++ b/src/examples/java/bjc/inflexion/InflexionTester.java
@@ -19,7 +19,19 @@ import bjc.inflexion.v2.Noun;
 import bjc.inflexion.v2.Nouns;
 import bjc.inflexion.v2.Prepositions;
 
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 
 /**
  * Test inflecting words.
@@ -46,7 +58,9 @@ public class InflexionTester {
 
 		Scanner scn = new Scanner(System.in);
 
-		System.out.print("Enter a noun to inflect (blank line to quit): ");
+		wikitest(scn, nounDB);
+
+		/*System.out.print("Enter a noun to inflect (blank line to quit): ");
 		String ln = scn.nextLine().trim();
 
 		while(!ln.equals("")) {
@@ -63,8 +77,168 @@ public class InflexionTester {
 
 			System.out.print("Enter a noun to inflect (blank line to quit): ");
 			ln = scn.nextLine().trim();
-		}
+		}*/
 
 		scn.close();
 	}
+
+	/**
+	 * Check computed plurals against the en-noun templates of a Wiktionary dump.
+	 *
+	 * Prompts for the name of a bzip2-compressed dump file, compares the plural
+	 * computed for each page title with the forms listed by the first en-noun
+	 * template found on that page, and prints each mismatch plus a summary.
+	 *
+	 * @param scn
+	 *                The scanner to read the dump file name from.
+	 * @param nounDB
+	 *                The noun database used to compute plurals.
+	 */
+	@SuppressWarnings("unused")
+	private static void wikitest(Scanner scn, Nouns nounDB) {
+		System.out.print("Enter name of dump file: ");
+
+		String fname = scn.nextLine().trim();
+
+		/*
+		 * All three streams are managed resources, so they are closed even when
+		 * reading fails. The text is decoded as UTF-8 rather than the platform
+		 * charset, and 'true' makes multi-stream bzip2 files readable in full.
+		 */
+		try(InputStream compressedStream = new FileInputStream(fname);
+				InputStream stream = new BZip2CompressorInputStream(compressedStream, true);
+				BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"))) {
+			/* Pattern to find word name */
+			Pattern titlePattern = Pattern.compile("<title>([^<]+)</title>");
+			/* Pattern to find beginning of wiki text */
+			Pattern textPattern = Pattern.compile("<text");
+			/* Pattern to find rank definition */
+			Pattern rankPattern = Pattern.compile("\\{\\{rank");
+			/* Pattern to find noun definition */
+			Pattern enNounPattern = Pattern.compile("\\{\\{en-noun([a-z0-9\\|\\-\\[\\]\\?\\!=]*)\\}\\}");
+			/* Pattern for a plural that is spelled out in full */
+			Pattern wordPattern = Pattern.compile("([a-zA-Z\\-]+)");
+
+			String line;
+			String word = "";
+			/* 1 while between a page's first <text tag and its first en-noun */
+			int text = 0;
+			int count = 0;
+			int basicCount = 0;
+			int wrong = 0;
+			int basicWrong = 0;
+			int wrongNoPlural = 0;
+			int wrongUncountable = 0;
+			boolean basicWord = false;
+			while((line = reader.readLine()) != null) {
+				Matcher titleMatcher = titlePattern.matcher(line);
+				if(titleMatcher.find()) {
+					word = titleMatcher.group(1);
+					if(word.startsWith("Wiktionary:")) {
+						/* Note: text and basicWord keep their previous values here */
+						continue;
+					}
+					basicWord = false;
+					text = 0;
+					continue;
+				}
+				Matcher textMatcher = textPattern.matcher(line);
+				if(textMatcher.find()) {
+					text++;
+					continue;
+				}
+				Matcher rankMatcher = rankPattern.matcher(line);
+				if(rankMatcher.find()) {
+					basicWord = true;
+					basicCount++;
+				}
+				/* Ignore lines unless exactly one <text tag and no en-noun were seen */
+				if(text != 1) {
+					continue;
+				}
+				Matcher enNounMatcher = enNounPattern.matcher(line);
+				if(enNounMatcher.find()) {
+					/* Bumping text makes later en-noun templates get skipped */
+					text++;
+					count++;
+					if(count % 5000 == 0) {
+						System.out.println(count);
+					}
+					String[] rules = enNounMatcher.group(1).split("\\|");
+					List<String> plurals = new ArrayList<>();
+
+					boolean uncountable = false;
+					boolean noPlural = false;
+					for(String rule : rules) {
+						if(rule.isEmpty()) {
+							continue;
+						}
+						if("-".equals(rule)) {
+							plurals.add(word);
+							uncountable = true;
+						} else if("s".equals(rule)) {
+							plurals.add(word + "s");
+						} else if("es".equals(rule)) {
+							plurals.add(word + "es");
+						} else if("!".equals(rule)) {
+							plurals.add("plural not attested");
+							uncountable = true;
+						} else if("?".equals(rule)) {
+							plurals.add("unknown");
+							noPlural = true;
+						} else {
+							Matcher matcher = wordPattern.matcher(rule);
+							if(matcher.matches()) {
+								plurals.add(rule);
+							}
+						}
+					}
+					/* No usable rule: expect the regular "s" plural */
+					if(plurals.isEmpty()) {
+						plurals.add(word + "s");
+					}
+
+					String calculatedPlural = nounDB.getNoun(word).plural();
+
+					if(!plurals.contains(calculatedPlural)) {
+						wrong++;
+						if(uncountable) {
+							wrongUncountable++;
+						} else if(noPlural) {
+							wrongNoPlural++;
+						}
+						if(basicWord) {
+							System.out.println("basic word: " + word + " got: "
+									+ calculatedPlural + ", but expected "
+									+ enNounMatcher.group(1));
+							basicWrong++;
+						} else {
+							System.out.println(word + " got: " + calculatedPlural
+									+ ", but expected " + enNounMatcher.group(1));
+						}
+					}
+				}
+			}
+
+			float correct = (count - wrong) * 100 / (float) count;
+			float basicCorrect = (basicCount - basicWrong) * 100 / (float) basicCount;
+			float wrongUncountablePercent = wrongUncountable * 100 / (float) count;
+			float wrongNoPluralPercent = wrongNoPlural * 100 / (float) count;
+			int justPlainWrong = wrong - wrongUncountable - wrongNoPlural;
+			float justPlainWrongPercent = justPlainWrong * 100 / (float) count;
+			System.out.println("Words checked: " + count + " (" + basicCount + " basic words)");
+			System.out.println("Correct: " + correct + "% (" + basicCorrect + "% basic words)");
+			System.out.println("Errors: ");
+			System.out.println(
+					"    Uncountable: " + wrongUncountable + " (" + wrongUncountablePercent + "%)");
+			System.out.println("    No plural form specified: " + wrongNoPlural + " ("
+					+ wrongNoPluralPercent + "%)");
+			System.out.println("    Incorrect answer: " + justPlainWrong + " (" + justPlainWrongPercent
+					+ "%)");
+		} catch(FileNotFoundException fnfex) {
+			fnfex.printStackTrace();
+		} catch(IOException ioex) {
+			ioex.printStackTrace();
+		}
+	}
 }
 \ No newline at end of file