From 7b56beefe4df24acd8437bf42262e7cd30d43970 Mon Sep 17 00:00:00 2001 From: bjculkin Date: Tue, 4 Apr 2017 18:51:52 -0400 Subject: Add a test Plus, more plural fixes --- data/nouns.txt | 27 +++- pom.xml | 76 +++++---- .../java/bjc/inflexion/InflexionTester.java | 178 ++++++++++++++++++++- 3 files changed, 245 insertions(+), 36 deletions(-) diff --git a/data/nouns.txt b/data/nouns.txt index 4122dc8..c257bd6 100644 --- a/data/nouns.txt +++ b/data/nouns.txt @@ -986,6 +986,7 @@ ascidium => | ascidia asylum => asylums | asyla avicularium => | avicularia axopodium => | axopodia +baculum => | bacula bacterium => | bacteria bifolium => | bifolia caecum => | caeca @@ -1321,7 +1322,7 @@ alumnus => | alumni alveolus => | alveoli aptychus => | aptychi aureus => | aurei -*bacillus => | *bacilli +*bacillus => | *bacilli bronchus => | bronchi bulimus => | bulimi cactus => cactuses | cacti @@ -1961,6 +1962,7 @@ testudo => testudos timpano => timpanos tiro => tiros tobacco => tobaccos +todo => todos Togo => Togos Tokyo => Tokyos torero => toreros @@ -2736,9 +2738,17 @@ its => theirs # Standard patterns of inflection for other nouns (in increasing order of generality)... --[aeiou]o => -[aeiou]os | --[aeo]lf => | -[aeo]lves --[aiy]nx => -[aiy]nxes | -[aiy]nges +-ao => -aos | +-eo => -eos | +-io => -ios | +-oo => -oos | +-uo => -uos | +-alf => | -alves +-elf => | -elves +-olf => | -olves +-anx => -anxes | -anges +-inx => -inxes | -inges +-ynx => -ynxes | -ynges -arf => | -arves -ceps => -ceps | -ch => -ches | @@ -2750,9 +2760,14 @@ its => theirs -oe => -oes | -o => -oes | -quy => -quies | --[aeiou]y => -[aeiou]ys | +-ay => -ays | +-ey => -eys | +-iy => -iys | +-oy => -oys | +-uy => -uys | -ss => -sses | --[^s]sis => | -[^s]ses +-sis => | -ses +#-[^s]sis => | -[^s]ses -trix => -trixes | -trices -us => -uses | -x => -xes | diff --git a/pom.xml b/pom.xml index b1364e4..d577778 100644 --- a/pom.xml +++ b/pom.xml @@ -1,29 +1,49 @@ - - 4.0.0 - inflexion - inflexion - 0.0.1-SNAPSHOT - Inflexion - Java based implementation of Damian Conway's pluralization algorithm. - - - - - maven-compiler-plugin - - 1.8 - 1.8 - - - - - - data/ - - **/*.txt - - - - + + 4.0.0 + bjc + inflexion + 0.0.1-SNAPSHOT + Inflexion + Java based implementation of Damian Conway's Lingua::EN::Inflexion module for perl + + + + Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + scm:git:https://github.com/bculkin2442/Inflexion.git + https://github.com/atteo/Inflexion + + + + + + maven-compiler-plugin + + 1.8 + 1.8 + + + + + + data/ + + **/*.txt + + + + + + + org.apache.commons + commons-compress + 1.13 + + \ No newline at end of file diff --git a/src/examples/java/bjc/inflexion/InflexionTester.java b/src/examples/java/bjc/inflexion/InflexionTester.java index 5f95de7..a11d168 100644 --- a/src/examples/java/bjc/inflexion/InflexionTester.java +++ b/src/examples/java/bjc/inflexion/InflexionTester.java @@ -19,7 +19,19 @@ import bjc.inflexion.v2.Noun; import bjc.inflexion.v2.Nouns; import bjc.inflexion.v2.Prepositions; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; import java.util.Scanner; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; /** * Test inflecting words. @@ -46,7 +58,9 @@ public class InflexionTester { Scanner scn = new Scanner(System.in); - System.out.print("Enter a noun to inflect (blank line to quit): "); + wikitest(scn, nounDB); + + /*System.out.print("Enter a noun to inflect (blank line to quit): "); String ln = scn.nextLine().trim(); while(!ln.equals("")) { @@ -63,8 +77,168 @@ public class InflexionTester { System.out.print("Enter a noun to inflect (blank line to quit): "); ln = scn.nextLine().trim(); - } + }*/ scn.close(); } + + @SuppressWarnings("unused") + private static void wikitest(Scanner scn, Nouns nounDB) { + System.out.print("Enter name of dump file: "); + + String fname = scn.nextLine().trim(); + + try(InputStream compressedStream = new FileInputStream(fname)) { + InputStream stream = new BZip2CompressorInputStream(compressedStream); + BufferedReader reader = new BufferedReader(new InputStreamReader(stream)); + + /* + * Pattern find word name + */ + Pattern titlePattern = Pattern.compile("([^<]+)"); + /* + * Pattern to find beginning of wiki text + */ + Pattern textPattern = Pattern.compile(" plurals = new ArrayList<>(); + + boolean uncountable = false; + boolean noPlural = false; + for(String rule : rules) { + if(rule.isEmpty()) { + continue; + } + if("-".equals(rule)) { + plurals.add(word); + uncountable = true; + } else if("s".equals(rule)) { + plurals.add(word + "s"); + } else if("es".equals(rule)) { + plurals.add(word + "es"); + } else if("!".equals(rule)) { + plurals.add("plural not attested"); + uncountable = true; + } else if("?".equals(rule)) { + plurals.add("unknown"); + noPlural = true; + } else { + Matcher matcher = wordPattern.matcher(rule); + if(matcher.matches()) { + plurals.add(rule); + } + } + } + if(plurals.isEmpty()) { + plurals.add(word + "s"); + } + + String calculatedPlural = nounDB.getNoun(word).plural(); + boolean ok = false; + for(String plural : plurals) { + if(plural.equals(calculatedPlural)) { + ok = true; + break; + } + } + + if(!ok) { + wrong++; + if(uncountable) { + wrongUncountable++; + } else if(noPlural) { + wrongNoPlural++; + } + if(basicWord) { + System.out.println("basic word: " + word + " got: " + + calculatedPlural + ", but expected " + + enNounMatcher.group(1)); + basicWrong++; + } else { + System.out.println(word + " got: " + calculatedPlural + + ", but expected " + enNounMatcher.group(1)); + } + } + } + } + reader.close(); + compressedStream.close(); + + float correct = (count - wrong) * 100 / (float) count; + float basicCorrect = (basicCount - basicWrong) * 100 / (float) basicCount; + float wrongUncountablePercent = wrongUncountable * 100 / (float) count; + float wrongNoPluralPercent = wrongNoPlural * 100 / (float) count; + int justPlainWrong = wrong - wrongUncountable - wrongNoPlural; + float justPlainWrongPercent = justPlainWrong * 100 / (float) count; + System.out.println("Words checked: " + count + " (" + basicCount + " basic words)"); + System.out.println("Correct: " + correct + "% (" + basicCorrect + "% basic words)"); + System.out.println("Errors: "); + System.out.println( + " Uncountable: " + wrongUncountable + " (" + wrongUncountablePercent + "%)"); + System.out.println(" No plural form specified: " + wrongNoPlural + " (" + + wrongNoPluralPercent + "%)"); + System.out.println(" Incorrect answer: " + justPlainWrong + " (" + justPlainWrongPercent + + "%)"); + } catch(FileNotFoundException fnfex) { + fnfex.printStackTrace(); + } catch(IOException ioex) { + ioex.printStackTrace(); + } + } } \ No newline at end of file -- cgit v1.2.3