diff options
| author | bjculkin <bjculkin@mix.wvu.edu> | 2017-04-04 18:51:52 -0400 |
|---|---|---|
| committer | bjculkin <bjculkin@mix.wvu.edu> | 2017-04-04 18:51:52 -0400 |
| commit | 7b56beefe4df24acd8437bf42262e7cd30d43970 (patch) | |
| tree | 0f8a4e98897ba9b42fc705ca39a21eed44943941 /src/examples | |
| parent | e4aa727c5ae37ef4d8df45ef535719f626b10917 (diff) | |
Add a test
Plus, more plural fixes
Diffstat (limited to 'src/examples')
| -rw-r--r-- | src/examples/java/bjc/inflexion/InflexionTester.java | 178 |
1 file changed, 176 insertions, 2 deletions
diff --git a/src/examples/java/bjc/inflexion/InflexionTester.java b/src/examples/java/bjc/inflexion/InflexionTester.java index 5f95de7..a11d168 100644 --- a/src/examples/java/bjc/inflexion/InflexionTester.java +++ b/src/examples/java/bjc/inflexion/InflexionTester.java @@ -19,7 +19,19 @@ import bjc.inflexion.v2.Noun; import bjc.inflexion.v2.Nouns; import bjc.inflexion.v2.Prepositions; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; import java.util.Scanner; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; /** * Test inflecting words. @@ -46,7 +58,9 @@ public class InflexionTester { Scanner scn = new Scanner(System.in); - System.out.print("Enter a noun to inflect (blank line to quit): "); + wikitest(scn, nounDB); + + /*System.out.print("Enter a noun to inflect (blank line to quit): "); String ln = scn.nextLine().trim(); while(!ln.equals("")) { @@ -63,8 +77,168 @@ public class InflexionTester { System.out.print("Enter a noun to inflect (blank line to quit): "); ln = scn.nextLine().trim(); - } + }*/ scn.close(); } + + @SuppressWarnings("unused") + private static void wikitest(Scanner scn, Nouns nounDB) { + System.out.print("Enter name of dump file: "); + + String fname = scn.nextLine().trim(); + + try(InputStream compressedStream = new FileInputStream(fname)) { + InputStream stream = new BZip2CompressorInputStream(compressedStream); + BufferedReader reader = new BufferedReader(new InputStreamReader(stream)); + + /* + * Pattern find word name + */ + Pattern titlePattern = Pattern.compile("<title>([^<]+)</title>"); + /* + * Pattern to find beginning of wiki text + */ + Pattern textPattern = Pattern.compile("<text"); + /* + * Pattern to find rank definition + */ + Pattern 
rankPattern = Pattern.compile("\\{\\{rank"); + /* + * Pattern to find noun definition + */ + Pattern enNounPattern = Pattern.compile("\\{\\{en-noun([a-z0-9\\|\\-\\[\\]\\?\\!=]*)\\}\\}"); + + Pattern wordPattern = Pattern.compile("([a-zA-Z\\-]+)"); + + String line; + String word = ""; + int text = 0; + int count = 0; + int basicCount = 0; + int wrong = 0; + int basicWrong = 0; + int wrongNoPlural = 0; + int wrongUncountable = 0; + boolean basicWord = false; + while((line = reader.readLine()) != null) { + Matcher titleMatcher = titlePattern.matcher(line); + if(titleMatcher.find()) { + word = titleMatcher.group(1); + if(word.startsWith("Wiktionary:")) { + continue; + } + basicWord = false; + text = 0; + continue; + } + Matcher textMatcher = textPattern.matcher(line); + if(textMatcher.find()) { + text++; + continue; + } + Matcher rankMatcher = rankPattern.matcher(line); + if(rankMatcher.find()) { + basicWord = true; + basicCount++; + } + if(text != 1) { + continue; + } + Matcher enNounMatcher = enNounPattern.matcher(line); + if(enNounMatcher.find()) { + // only first + /* + * if (text != 1) { continue; } + */ + text++; + count++; + if(count % 5000 == 0) { + System.out.println(count); + } + String[] rules = enNounMatcher.group(1).split("\\|"); + List<String> plurals = new ArrayList<>(); + + boolean uncountable = false; + boolean noPlural = false; + for(String rule : rules) { + if(rule.isEmpty()) { + continue; + } + if("-".equals(rule)) { + plurals.add(word); + uncountable = true; + } else if("s".equals(rule)) { + plurals.add(word + "s"); + } else if("es".equals(rule)) { + plurals.add(word + "es"); + } else if("!".equals(rule)) { + plurals.add("plural not attested"); + uncountable = true; + } else if("?".equals(rule)) { + plurals.add("unknown"); + noPlural = true; + } else { + Matcher matcher = wordPattern.matcher(rule); + if(matcher.matches()) { + plurals.add(rule); + } + } + } + if(plurals.isEmpty()) { + plurals.add(word + "s"); + } + + String calculatedPlural = 
nounDB.getNoun(word).plural(); + boolean ok = false; + for(String plural : plurals) { + if(plural.equals(calculatedPlural)) { + ok = true; + break; + } + } + + if(!ok) { + wrong++; + if(uncountable) { + wrongUncountable++; + } else if(noPlural) { + wrongNoPlural++; + } + if(basicWord) { + System.out.println("basic word: " + word + " got: " + + calculatedPlural + ", but expected " + + enNounMatcher.group(1)); + basicWrong++; + } else { + System.out.println(word + " got: " + calculatedPlural + + ", but expected " + enNounMatcher.group(1)); + } + } + } + } + reader.close(); + compressedStream.close(); + + float correct = (count - wrong) * 100 / (float) count; + float basicCorrect = (basicCount - basicWrong) * 100 / (float) basicCount; + float wrongUncountablePercent = wrongUncountable * 100 / (float) count; + float wrongNoPluralPercent = wrongNoPlural * 100 / (float) count; + int justPlainWrong = wrong - wrongUncountable - wrongNoPlural; + float justPlainWrongPercent = justPlainWrong * 100 / (float) count; + System.out.println("Words checked: " + count + " (" + basicCount + " basic words)"); + System.out.println("Correct: " + correct + "% (" + basicCorrect + "% basic words)"); + System.out.println("Errors: "); + System.out.println( + " Uncountable: " + wrongUncountable + " (" + wrongUncountablePercent + "%)"); + System.out.println(" No plural form specified: " + wrongNoPlural + " (" + + wrongNoPluralPercent + "%)"); + System.out.println(" Incorrect answer: " + justPlainWrong + " (" + justPlainWrongPercent + + "%)"); + } catch(FileNotFoundException fnfex) { + fnfex.printStackTrace(); + } catch(IOException ioex) { + ioex.printStackTrace(); + } + } }
\ No newline at end of file |
