summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- data/nouns.txt 27
-rw-r--r-- pom.xml 76
-rw-r--r-- src/examples/java/bjc/inflexion/InflexionTester.java 178
3 files changed, 245 insertions, 36 deletions
diff --git a/data/nouns.txt b/data/nouns.txt
index 4122dc8..c257bd6 100644
--- a/data/nouns.txt
+++ b/data/nouns.txt
@@ -986,6 +986,7 @@ ascidium => | ascidia
asylum => asylums | asyla
avicularium => | avicularia
axopodium => | axopodia
+baculum => | bacula
bacterium => | bacteria
bifolium => | bifolia
caecum => | caeca
@@ -1321,7 +1322,7 @@ alumnus => | alumni
alveolus => | alveoli
aptychus => | aptychi
aureus => | aurei
-*bacillus => | *bacilli
+*bacillus => | *bacilli
bronchus => | bronchi
bulimus => | bulimi
cactus => cactuses | cacti
@@ -1961,6 +1962,7 @@ testudo => testudos
timpano => timpanos
tiro => tiros
tobacco => tobaccos
+todo => todos
Togo => Togos
Tokyo => Tokyos
torero => toreros
@@ -2736,9 +2738,17 @@ its => theirs
# Standard patterns of inflection for other nouns (in increasing order of generality)...
--[aeiou]o => -[aeiou]os |
--[aeo]lf => | -[aeo]lves
--[aiy]nx => -[aiy]nxes | -[aiy]nges
+-ao => -aos |
+-eo => -eos |
+-io => -ios |
+-oo => -oos |
+-uo => -uos |
+-alf => | -alves
+-elf => | -elves
+-olf => | -olves
+-anx => -anxes | -anges
+-inx => -inxes | -inges
+-ynx => -ynxes | -ynges
-arf => | -arves
-ceps => -ceps |
-ch => -ches |
@@ -2750,9 +2760,14 @@ its => theirs
-oe => -oes |
-o => -oes |
-quy => -quies |
--[aeiou]y => -[aeiou]ys |
+-ay => -ays |
+-ey => -eys |
+-iy => -iys |
+-oy => -oys |
+-uy => -uys |
-ss => -sses |
--[^s]sis => | -[^s]ses
+-sis => | -ses
+#-[^s]sis => | -[^s]ses
-trix => -trixes | -trices
-us => -uses |
-x => -xes |
diff --git a/pom.xml b/pom.xml
index b1364e4..d577778 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,29 +1,49 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <groupId>inflexion</groupId>
- <artifactId>inflexion</artifactId>
- <version>0.0.1-SNAPSHOT</version>
- <name>Inflexion</name>
- <description>Java based implementation of Damian Conway's pluralization algorithm.</description>
-
- <build>
- <plugins>
- <plugin>
- <artifactId>maven-compiler-plugin</artifactId>
- <configuration>
- <source>1.8</source>
- <target>1.8</target>
- </configuration>
- </plugin>
- </plugins>
- <resources>
- <resource>
- <directory>data/</directory>
- <includes>
- <include>**/*.txt</include>
- </includes>
- </resource>
- </resources>
- </build>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>bjc</groupId>
+ <artifactId>inflexion</artifactId>
+ <version>0.0.1-SNAPSHOT</version>
+ <name>Inflexion</name>
+ <description>Java based implementation of Damian Conway's Lingua::EN::Inflexion module for perl</description>
+
+ <licenses>
+ <license>
+ <name>Apache License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ <distribution>repo</distribution>
+ </license>
+ </licenses>
+
+ <scm>
+ <connection>scm:git:https://github.com/bculkin2442/Inflexion.git</connection>
+ <url>https://github.com/atteo/Inflexion</url>
+ </scm>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <source>1.8</source>
+ <target>1.8</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ <resources>
+ <resource>
+ <directory>data/</directory>
+ <includes>
+ <include>**/*.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ </build>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <version>1.13</version>
+ </dependency>
+ </dependencies>
</project> \ No newline at end of file
diff --git a/src/examples/java/bjc/inflexion/InflexionTester.java b/src/examples/java/bjc/inflexion/InflexionTester.java
index 5f95de7..a11d168 100644
--- a/src/examples/java/bjc/inflexion/InflexionTester.java
+++ b/src/examples/java/bjc/inflexion/InflexionTester.java
@@ -19,7 +19,19 @@ import bjc.inflexion.v2.Noun;
import bjc.inflexion.v2.Nouns;
import bjc.inflexion.v2.Prepositions;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
/**
* Test inflecting words.
@@ -46,7 +58,9 @@ public class InflexionTester {
Scanner scn = new Scanner(System.in);
- System.out.print("Enter a noun to inflect (blank line to quit): ");
+ wikitest(scn, nounDB);
+
+ /*System.out.print("Enter a noun to inflect (blank line to quit): ");
String ln = scn.nextLine().trim();
while(!ln.equals("")) {
@@ -63,8 +77,168 @@ public class InflexionTester {
System.out.print("Enter a noun to inflect (blank line to quit): ");
ln = scn.nextLine().trim();
- }
+ }*/
scn.close();
}
+ /**
+ * Compare the plurals produced by the noun database against the en-noun
+ * templates found in a bzip2-compressed dump file (presumably a Wiktionary
+ * XML dump - confirm), and print accuracy statistics.
+ *
+ * The file is read line by line. A line containing a title element sets the
+ * current word, a line containing "<text" arms the search, and the first
+ * en-noun template line after that is checked against the database. Every
+ * mismatch is printed as it is found; totals and percentages are printed
+ * once the whole file has been read. I/O failures are reported with
+ * printStackTrace() and the method then returns normally.
+ *
+ * @param scn
+ * Scanner the dump file name is read from (one line, trimmed).
+ * @param nounDB
+ * Noun database queried for the plural of each word.
+ */
+ @SuppressWarnings("unused")
+ private static void wikitest(Scanner scn, Nouns nounDB) {
+ System.out.print("Enter name of dump file: ");
+
+ String fname = scn.nextLine().trim();
+ // Only the FileInputStream is managed by try-with-resources; the bzip2 stream and reader wrapping it are closed by hand at the end of the try body.
+ try(InputStream compressedStream = new FileInputStream(fname)) {
+ InputStream stream = new BZip2CompressorInputStream(compressedStream);
+ BufferedReader reader = new BufferedReader(new InputStreamReader(stream)); // NOTE(review): no charset given, so the platform default is used - confirm the dump's encoding
+
+ /*
+ * Pattern to find the word name (the content of a title element);
+ * group 1 is the title text.
+ */
+ Pattern titlePattern = Pattern.compile("<title>([^<]+)</title>");
+ /*
+ * Pattern to find beginning of wiki text
+ */
+ Pattern textPattern = Pattern.compile("<text");
+ /*
+ * Pattern to find rank definition
+ */
+ Pattern rankPattern = Pattern.compile("\\{\\{rank");
+ /*
+ * Pattern to find noun definition; group 1 is everything between
+ * "{{en-noun" and the closing "}}", i.e. the '|'-separated rules.
+ */
+ Pattern enNounPattern = Pattern.compile("\\{\\{en-noun([a-z0-9\\|\\-\\[\\]\\?\\!=]*)\\}\\}");
+ // Pattern for a rule that is itself a literal plural: ASCII letters and hyphens only.
+ Pattern wordPattern = Pattern.compile("([a-zA-Z\\-]+)");
+ // Running state for the scan; the counters feed the summary printed after the loop.
+ String line;
+ String word = ""; // title most recently read
+ int text = 0; // "<text" lines seen since the last title reset; also bumped after an en-noun is handled
+ int count = 0; // en-noun entries checked
+ int basicCount = 0; // lines containing "{{rank"
+ int wrong = 0; // entries whose calculated plural matched none of the expected ones
+ int basicWrong = 0; // wrong entries seen while basicWord was set
+ int wrongNoPlural = 0; // wrong entries that had a "?" rule (and no "-"/"!" rule)
+ int wrongUncountable = 0; // wrong entries that had a "-" or "!" rule
+ boolean basicWord = false; // set once a "{{rank" line is seen, cleared on the next title
+ while((line = reader.readLine()) != null) {
+ Matcher titleMatcher = titlePattern.matcher(line);
+ if(titleMatcher.find()) {
+ word = titleMatcher.group(1);
+ if(word.startsWith("Wiktionary:")) {
+ continue; // NOTE(review): word is already overwritten here, but text and basicWord keep their previous values - confirm this is intended
+ }
+ basicWord = false;
+ text = 0;
+ continue;
+ }
+ Matcher textMatcher = textPattern.matcher(line);
+ if(textMatcher.find()) {
+ text++;
+ continue;
+ }
+ Matcher rankMatcher = rankPattern.matcher(line);
+ if(rankMatcher.find()) {
+ basicWord = true;
+ basicCount++; // counted per matching line, and before the text != 1 filter below
+ }
+ if(text != 1) { // en-noun lines are only considered while text is exactly 1
+ continue;
+ }
+ Matcher enNounMatcher = enNounPattern.matcher(line);
+ if(enNounMatcher.find()) {
+ // only first
+ /*
+ * if (text != 1) { continue; }
+ */
+ text++; // moves text off 1 so later en-noun lines are skipped by the check above
+ count++;
+ if(count % 5000 == 0) { // progress output every 5000 entries
+ System.out.println(count);
+ }
+ String[] rules = enNounMatcher.group(1).split("\\|");
+ List<String> plurals = new ArrayList<>(); // acceptable plurals derived from the rules
+
+ boolean uncountable = false;
+ boolean noPlural = false;
+ for(String rule : rules) {
+ if(rule.isEmpty()) {
+ continue;
+ }
+ if("-".equals(rule)) { // plural identical to the word; flagged uncountable
+ plurals.add(word);
+ uncountable = true;
+ } else if("s".equals(rule)) {
+ plurals.add(word + "s");
+ } else if("es".equals(rule)) {
+ plurals.add(word + "es");
+ } else if("!".equals(rule)) { // placeholder text is added as the expected plural; flagged uncountable
+ plurals.add("plural not attested");
+ uncountable = true;
+ } else if("?".equals(rule)) { // placeholder text is added as the expected plural; flagged noPlural
+ plurals.add("unknown");
+ noPlural = true;
+ } else {
+ Matcher matcher = wordPattern.matcher(rule);
+ if(matcher.matches()) { // whole rule must be letters/hyphens; any other rule is ignored
+ plurals.add(rule);
+ }
+ }
+ }
+ if(plurals.isEmpty()) { // no usable rule: expect the word plus "s"
+ plurals.add(word + "s");
+ }
+ // Ask the database for its plural and accept it if it equals any expected form.
+ String calculatedPlural = nounDB.getNoun(word).plural();
+ boolean ok = false;
+ for(String plural : plurals) {
+ if(plural.equals(calculatedPlural)) {
+ ok = true;
+ break;
+ }
+ }
+ // Classify and report the mismatch; uncountable takes precedence over noPlural.
+ if(!ok) {
+ wrong++;
+ if(uncountable) {
+ wrongUncountable++;
+ } else if(noPlural) {
+ wrongNoPlural++;
+ }
+ if(basicWord) {
+ System.out.println("basic word: " + word + " got: "
+ + calculatedPlural + ", but expected "
+ + enNounMatcher.group(1));
+ basicWrong++;
+ } else {
+ System.out.println(word + " got: " + calculatedPlural
+ + ", but expected " + enNounMatcher.group(1));
+ }
+ }
+ }
+ }
+ reader.close(); // not reached if an exception is thrown above
+ compressedStream.close(); // explicit close; try-with-resources also closes this stream
+ // Percentages use float division, so a zero count or basicCount yields NaN rather than an exception.
+ float correct = (count - wrong) * 100 / (float) count;
+ float basicCorrect = (basicCount - basicWrong) * 100 / (float) basicCount;
+ float wrongUncountablePercent = wrongUncountable * 100 / (float) count;
+ float wrongNoPluralPercent = wrongNoPlural * 100 / (float) count;
+ int justPlainWrong = wrong - wrongUncountable - wrongNoPlural; // mismatches with no "-", "!" or "?" rule
+ float justPlainWrongPercent = justPlainWrong * 100 / (float) count;
+ System.out.println("Words checked: " + count + " (" + basicCount + " basic words)");
+ System.out.println("Correct: " + correct + "% (" + basicCorrect + "% basic words)");
+ System.out.println("Errors: ");
+ System.out.println(
+ " Uncountable: " + wrongUncountable + " (" + wrongUncountablePercent + "%)");
+ System.out.println(" No plural form specified: " + wrongNoPlural + " ("
+ + wrongNoPluralPercent + "%)");
+ System.out.println(" Incorrect answer: " + justPlainWrong + " (" + justPlainWrongPercent
+ + "%)");
+ } catch(FileNotFoundException fnfex) {
+ fnfex.printStackTrace();
+ } catch(IOException ioex) {
+ ioex.printStackTrace();
+ }
+ }
} \ No newline at end of file