diff options
| -rw-r--r-- | data/nouns.txt | 27 | ||||
| -rw-r--r-- | pom.xml | 76 | ||||
| -rw-r--r-- | src/examples/java/bjc/inflexion/InflexionTester.java | 178 |
3 files changed, 245 insertions, 36 deletions
diff --git a/data/nouns.txt b/data/nouns.txt index 4122dc8..c257bd6 100644 --- a/data/nouns.txt +++ b/data/nouns.txt @@ -986,6 +986,7 @@ ascidium => | ascidia asylum => asylums | asyla avicularium => | avicularia axopodium => | axopodia +baculum => | bacula bacterium => | bacteria bifolium => | bifolia caecum => | caeca @@ -1321,7 +1322,7 @@ alumnus => | alumni alveolus => | alveoli aptychus => | aptychi aureus => | aurei -*bacillus => | *bacilli +*bacillus => | *bacilli bronchus => | bronchi bulimus => | bulimi cactus => cactuses | cacti @@ -1961,6 +1962,7 @@ testudo => testudos timpano => timpanos tiro => tiros tobacco => tobaccos +todo => todos Togo => Togos Tokyo => Tokyos torero => toreros @@ -2736,9 +2738,17 @@ its => theirs # Standard patterns of inflection for other nouns (in increasing order of generality)... --[aeiou]o => -[aeiou]os | --[aeo]lf => | -[aeo]lves --[aiy]nx => -[aiy]nxes | -[aiy]nges +-ao => -aos | +-eo => -eos | +-io => -ios | +-oo => -oos | +-uo => -uos | +-alf => | -alves +-elf => | -elves +-olf => | -olves +-anx => -anxes | -anges +-inx => -inxes | -inges +-ynx => -ynxes | -ynges -arf => | -arves -ceps => -ceps | -ch => -ches | @@ -2750,9 +2760,14 @@ its => theirs -oe => -oes | -o => -oes | -quy => -quies | --[aeiou]y => -[aeiou]ys | +-ay => -ays | +-ey => -eys | +-iy => -iys | +-oy => -oys | +-uy => -uys | -ss => -sses | --[^s]sis => | -[^s]ses +-sis => | -ses +#-[^s]sis => | -[^s]ses -trix => -trixes | -trices -us => -uses | -x => -xes | @@ -1,29 +1,49 @@ -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <groupId>inflexion</groupId>
- <artifactId>inflexion</artifactId>
- <version>0.0.1-SNAPSHOT</version>
- <name>Inflexion</name>
- <description>Java based implementation of Damian Conway's pluralization algorithm.</description>
-
- <build>
- <plugins>
- <plugin>
- <artifactId>maven-compiler-plugin</artifactId>
- <configuration>
- <source>1.8</source>
- <target>1.8</target>
- </configuration>
- </plugin>
- </plugins>
- <resources>
- <resource>
- <directory>data/</directory>
- <includes>
- <include>**/*.txt</include>
- </includes>
- </resource>
- </resources>
- </build>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <groupId>bjc</groupId> + <artifactId>inflexion</artifactId> + <version>0.0.1-SNAPSHOT</version> + <name>Inflexion</name> + <description>Java based implementation of Damian Conway's Lingua::EN::Inflexion module for perl</description> + + <licenses> + <license> + <name>Apache License, Version 2.0</name> + <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> + <distribution>repo</distribution> + </license> + </licenses> + + <scm> + <connection>scm:git:https://github.com/bculkin2442/Inflexion.git</connection> + <url>https://github.com/atteo/Inflexion</url> + </scm> + + <build> + <plugins> + <plugin> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <source>1.8</source> + <target>1.8</target> + </configuration> + </plugin> + </plugins> + <resources> + <resource> + <directory>data/</directory> + <includes> + <include>**/*.txt</include> + </includes> + </resource> + </resources> + </build> + <dependencies> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <version>1.13</version> + </dependency> + </dependencies> </project>
\ No newline at end of file diff --git a/src/examples/java/bjc/inflexion/InflexionTester.java b/src/examples/java/bjc/inflexion/InflexionTester.java index 5f95de7..a11d168 100644 --- a/src/examples/java/bjc/inflexion/InflexionTester.java +++ b/src/examples/java/bjc/inflexion/InflexionTester.java @@ -19,7 +19,19 @@ import bjc.inflexion.v2.Noun; import bjc.inflexion.v2.Nouns; import bjc.inflexion.v2.Prepositions; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; import java.util.Scanner; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; /** * Test inflecting words. @@ -46,7 +58,9 @@ public class InflexionTester { Scanner scn = new Scanner(System.in); - System.out.print("Enter a noun to inflect (blank line to quit): "); + wikitest(scn, nounDB); + + /*System.out.print("Enter a noun to inflect (blank line to quit): "); String ln = scn.nextLine().trim(); while(!ln.equals("")) { @@ -63,8 +77,168 @@ public class InflexionTester { System.out.print("Enter a noun to inflect (blank line to quit): "); ln = scn.nextLine().trim(); - } + }*/ scn.close(); } + + /** Prompts on scn for a file name, reads that file through a bzip2 decompressor line by line, and for the first en-noun template after each title compares the plural computed by nounDB for the title against the plural forms listed in the template. Mismatches are printed as they are found and summary percentages are printed at the end. FileNotFoundException and IOException are caught and their stack traces printed. */ @SuppressWarnings("unused") + private static void wikitest(Scanner scn, Nouns nounDB) { + System.out.print("Enter name of dump file: "); + + String fname = scn.nextLine().trim(); + + try(InputStream compressedStream = new FileInputStream(fname)) { + InputStream stream = new BZip2CompressorInputStream(compressedStream); + /* NOTE(review): no charset is passed to InputStreamReader, so the JVM default charset is used; confirm it matches the encoding of the dump */ BufferedReader reader = new BufferedReader(new InputStreamReader(stream)); + + /* + * Pattern to find the word name (the text between the title tags); group 1 is the word + */ + Pattern titlePattern = Pattern.compile("<title>([^<]+)</title>"); + /* + * Pattern to find the beginning of the wiki text (an opening text tag) + */ + Pattern textPattern = Pattern.compile("<text"); + /* + * Pattern to find rank 
definition (a match marks the current word as a basic word) + */ + Pattern rankPattern = Pattern.compile("\\{\\{rank"); + /* + * Pattern to find noun definition (an en-noun template); group 1 captures the template arguments, which are later split on the pipe character + */ + Pattern enNounPattern = Pattern.compile("\\{\\{en-noun([a-z0-9\\|\\-\\[\\]\\?\\!=]*)\\}\\}"); + + /* A template argument must fully match this pattern to be accepted as an explicit plural form */ Pattern wordPattern = Pattern.compile("([a-zA-Z\\-]+)"); + + String line; + String word = ""; + /* text counts text-tag matches since the last title; noun templates are only examined while it equals 1 */ int text = 0; + int count = 0; + int basicCount = 0; + int wrong = 0; + int basicWrong = 0; + int wrongNoPlural = 0; + int wrongUncountable = 0; + boolean basicWord = false; + while((line = reader.readLine()) != null) { + Matcher titleMatcher = titlePattern.matcher(line); + if(titleMatcher.find()) { + word = titleMatcher.group(1); + if(word.startsWith("Wiktionary:")) { + /* NOTE(review): this continue skips the basicWord and text reset below, so the state of the previous title is kept for Wiktionary: pages; confirm this is intended */ continue; + } + basicWord = false; + text = 0; + continue; + } + Matcher textMatcher = textPattern.matcher(line); + if(textMatcher.find()) { + text++; + continue; + } + Matcher rankMatcher = rankPattern.matcher(line); + if(rankMatcher.find()) { + basicWord = true; + /* NOTE(review): incremented on every rank match, independently of whether the page is ever added to count; confirm this is the intended denominator for basicCorrect */ basicCount++; + } + if(text != 1) { + continue; + } + Matcher enNounMatcher = enNounPattern.matcher(line); + if(enNounMatcher.find()) { + // only first + /* + * if (text != 1) { continue; } + */ + text++; + count++; + if(count % 5000 == 0) { + System.out.println(count); + } + String[] rules = enNounMatcher.group(1).split("\\|"); + List<String> plurals = new ArrayList<>(); + + boolean uncountable = false; + boolean noPlural = false; + for(String rule : rules) { + if(rule.isEmpty()) { + continue; + } + if("-".equals(rule)) { + plurals.add(word); + uncountable = true; + } else if("s".equals(rule)) { + plurals.add(word + "s"); + } else if("es".equals(rule)) { + plurals.add(word + "es"); + } else if("!".equals(rule)) { + plurals.add("plural not attested"); + uncountable = true; + } else if("?".equals(rule)) { + plurals.add("unknown"); + noPlural = true; + } else { + Matcher matcher = wordPattern.matcher(rule); + if(matcher.matches()) { + plurals.add(rule); + } + } + } + if(plurals.isEmpty()) { + plurals.add(word + "s"); + } + + String 
calculatedPlural = nounDB.getNoun(word).plural(); + boolean ok = false; + for(String plural : plurals) { + if(plural.equals(calculatedPlural)) { + ok = true; + break; + } + } + + if(!ok) { + wrong++; + if(uncountable) { + wrongUncountable++; + } else if(noPlural) { + wrongNoPlural++; + } + if(basicWord) { + System.out.println("basic word: " + word + " got: " + + calculatedPlural + ", but expected " + + enNounMatcher.group(1)); + basicWrong++; + } else { + System.out.println(word + " got: " + calculatedPlural + + ", but expected " + enNounMatcher.group(1)); + } + } + } + } + /* NOTE(review): reader and stream are not declared as try-with-resources resources, so reader is only closed here on the non-exceptional path; compressedStream is also closed by the enclosing try-with-resources */ reader.close(); + compressedStream.close(); + + /* NOTE(review): when count or basicCount is 0 the float divisions below evaluate to NaN */ float correct = (count - wrong) * 100 / (float) count; + float basicCorrect = (basicCount - basicWrong) * 100 / (float) basicCount; + float wrongUncountablePercent = wrongUncountable * 100 / (float) count; + float wrongNoPluralPercent = wrongNoPlural * 100 / (float) count; + int justPlainWrong = wrong - wrongUncountable - wrongNoPlural; + float justPlainWrongPercent = justPlainWrong * 100 / (float) count; + System.out.println("Words checked: " + count + " (" + basicCount + " basic words)"); + System.out.println("Correct: " + correct + "% (" + basicCorrect + "% basic words)"); + System.out.println("Errors: "); + System.out.println( + " Uncountable: " + wrongUncountable + " (" + wrongUncountablePercent + "%)"); + System.out.println(" No plural form specified: " + wrongNoPlural + " (" + + wrongNoPluralPercent + "%)"); + System.out.println(" Incorrect answer: " + justPlainWrong + " (" + justPlainWrongPercent + + "%)"); + } catch(FileNotFoundException fnfex) { + fnfex.printStackTrace(); + } catch(IOException ioex) { + ioex.printStackTrace(); + } + } }
\ No newline at end of file |
