diff options
| author | Benjamin J. Culkin <bjculkin@mix.wvu.edu> | 2018-06-07 16:26:46 -0300 |
|---|---|---|
| committer | Benjamin J. Culkin <bjculkin@mix.wvu.edu> | 2018-06-07 16:26:46 -0300 |
| commit | 7f16ae0286ab7492eee9f4019d976bc5ca95d556 (patch) | |
| tree | 811d0f5b4713260b5a9ffb9ab64d676c3746dbc3 | |
| parent | 235208946ceb2bf0f422956a3ebc0ebb88ba28b6 (diff) | |
Indefinites
19 files changed, 216 insertions, 36 deletions
@@ -1,2 +1,3 @@ # inflexion -Java implementation of Damian Conway's pluralization algorithm + +Java implementation of Damian Conway's Lingua::EN::Inflexion algorithm diff --git a/data/nouns.txt b/data/nouns.txt index fca5b39..3325daf 100644 --- a/data/nouns.txt +++ b/data/nouns.txt @@ -560,6 +560,7 @@ topos => topoi absis => | absides aphis => | aphides apsis => | apsides +aspis => | aspides caryopsis => | caryopsides chrysalis => chrysalises | chrysalides cinclis => | cinclides diff --git a/indefinite.txt b/indefinite.txt new file mode 100644 index 0000000..dcf2997 --- /dev/null +++ b/indefinite.txt @@ -0,0 +1,75 @@ +# Special cases of A/AN... +my $ORDINAL_AN = qr{\A [aefhilmnorsx] -?th \Z}ix; +my $ORDINAL_A = qr{\A [bcdgjkpqtuvwyz] -?th \Z}ix; +my $EXPLICIT_AN = qr{\A (?: euler | hour(?!i) | heir | honest | hono )}ix; +my $SINGLE_AN = qr{\A [aefhilmnorsx] \Z}ix; +my $SINGLE_A = qr{\A [bcdgjkpqtuvwyz] \Z}ix; + +# This pattern matches strings of capitals (i.e. abbreviations) that +# start with a "vowel-sound" consonant followed by another consonant, +# and which are not likely to be real words +# (oh, all right then, it's just magic!)... + +my $ABBREV_AN = qr{ + \A + (?! FJO | [HLMNS]Y. | RY[EO] | SQU + | ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU] + ) + [FHLMNRSX][A-Z] +}xms; + +# This pattern codes the beginnings of all English words beginning with a +# 'Y' followed by a consonant. Any other Y-consonant prefix therefore +# implies an abbreviation... + +my $INITIAL_Y_AN = qr{\A y (?: b[lor] | cl[ea] | fere | gg | p[ios] | rou | tt)}xi; + + +# Select the indefinite article for $word: each rule below is tried in +# order and the first pattern that matches decides the result. Returns +# the string "a" or "an"; falls back to "a" when no rule matches. +sub select_indefinite_article { + my ($word) = @_; + + # Handle ordinal forms... + return "a" if $word =~ $ORDINAL_A; + return "an" if $word =~ $ORDINAL_AN; + + # Handle special cases... + return "an" if $word =~ $EXPLICIT_AN; + return "an" if $word =~ $SINGLE_AN; + return "a" if $word =~ $SINGLE_A; + + # Handle abbreviations... 
+ return "an" if $word =~ $ABBREV_AN; + return "an" if $word =~ /\A [aefhilmnorsx][.-]/xi; + return "a" if $word =~ /\A [a-z][.-]/xi; + + # Handle consonants + + return "a" if $word =~ /\A [^aeiouy] /xi; + + # Handle special vowel-forms + + return "a" if $word =~ /\A e [uw] /xi; + return "a" if $word =~ /\A onc?e \b /xi; + return "a" if $word =~ /\A uni (?: [^nmd] | mo) /xi; + return "an" if $word =~ /\A ut[th] /xi; + return "a" if $word =~ /\A u [bcfhjkqrst] [aeiou] /xi; + + # Handle special capitals + + return "a" if $word =~ /\A U [NK] [AIEO]? /x; + + # Handle vowels + + return "an" if $word =~ /\A [aeiou]/xi; + + # Handle Y... (before certain consonants implies (unnaturalized) "I.." sound) + return "an" if $word =~ $INITIAL_Y_AN; + + # Otherwise, guess "A" + return "a"; +} + + @@ -5,13 +5,13 @@ <groupId>bjc</groupId> <artifactId>inflexion</artifactId> - <version>0.0.1-SNAPSHOT</version> + <version>1.0.0</version> <name>Inflexion</name> <description>Java based implementation of Damian Conway's Lingua::EN::Inflexion module for perl</description> <properties> - <main.class>bjc.inflexion.examples.InflexionTester</main.class> + <main.class>bjc.inflexion.examples.IndefTester</main.class> </properties> <licenses> diff --git a/src/examples/java/bjc/inflexion/examples/IndefTester.java b/src/examples/java/bjc/inflexion/examples/IndefTester.java new file mode 100644 index 0000000..fd1929c --- /dev/null +++ b/src/examples/java/bjc/inflexion/examples/IndefTester.java @@ -0,0 +1,23 @@ +package bjc.inflexion.examples; + +import java.util.Scanner; + +import bjc.inflexion.EnglishUtils; +/* Console driver: reads words from standard input and prints each one preceded by the article chosen by EnglishUtils.pickIndefinite, stopping at the first line that is empty after trimming. */ +public class IndefTester { + public static void main(String[] args) { + Scanner scn = new Scanner(System.in); + + System.out.print("Enter word: "); + String word = scn.nextLine().trim(); + + while(!word.equals("")) { + System.out.printf("\t%s %s\n", EnglishUtils.pickIndefinite(word), word); + + System.out.print("Enter word: "); + word = scn.nextLine().trim(); + } + + scn.close(); + } +} diff
--git a/src/main/java/bjc/inflexion/EnglishUtils.java b/src/main/java/bjc/inflexion/EnglishUtils.java index 28fc6c6..e233018 100644 --- a/src/main/java/bjc/inflexion/EnglishUtils.java +++ b/src/main/java/bjc/inflexion/EnglishUtils.java @@ -1,6 +1,4 @@ -/** - * (C) Copyright 2017 Benjamin Culkin. - * +/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -15,6 +13,8 @@ */ package bjc.inflexion; +import java.util.regex.Pattern; + /** * General utils for dealing with english. * @@ -72,4 +72,68 @@ public class EnglishUtils { return "many"; } + /* Article-selection patterns, mirroring the Perl rules in indefinite.txt. Each is applied with find(), so a pattern without a trailing \\Z anchor is a prefix test. */ + private static Pattern AN_ORD = Pattern.compile("(?i)\\A[aefhilmnorsx]-?th\\Z"); + private static Pattern A_ORD = Pattern.compile("(?i)\\A[bcdgjkpqtuvwyz]-?th\\Z"); + private static Pattern EXP_AN = Pattern.compile("(?i)\\A(?:euler|hour(?!i)|heir|honest|hono)"); + private static Pattern SIN_AN = Pattern.compile("(?i)\\A[aefhilmnorsx]\\Z"); + private static Pattern SIN_A = Pattern.compile("(?i)\\A[bcdgjkpqtuvwyz]\\Z"); + /* Strings of capitals (abbreviations) that start with a vowel-sound consonant followed by another letter; the negative look-ahead rejects prefixes that look like ordinary words. Case-sensitive on purpose. */ + private static Pattern ABBREV_AN = Pattern.compile("\\A(?!FJO|[HLMNS]Y.|RY[EO]|SQU|(F[LR]?|[HL]|MN?|N|RH?|S[CHKLMNPTVW]?|X(YL)?)[AEIOU])[FHLMNRSX][A-Z]"); + + private static Pattern IN_Y_AN = Pattern.compile("(?i)\\Ay(?:b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)"); + + private static Pattern ABBREV_C2 = Pattern.compile("(?i)\\A[aefhilmnorsx][.-]"); + private static Pattern ABBREV_C3 = Pattern.compile("(?i)\\A[a-z][.-]"); + + private static Pattern CONSONANT = Pattern.compile("(?i)\\A[^aeiouy]"); + /* SPECVOWEL_C2 must end in the regex word boundary, written \\b in source; a single-backslash \b inside a Java string literal is the backspace character. */ + private static Pattern SPECVOWEL_C1 = Pattern.compile("(?i)\\Ae[uw]"); + private static Pattern SPECVOWEL_C2 = Pattern.compile("(?i)\\Aonc?e\\b"); + private static Pattern SPECVOWEL_C3 = Pattern.compile("(?i)\\Auni(?:[^nmd]|mo)"); + private static Pattern SPECVOWEL_C4 = Pattern.compile("(?i)\\Aut[th]"); + private static Pattern SPECVOWEL_C5 = Pattern.compile("(?i)\\Au[bcfhjkqrst][aeiou]"); + + 
private static Pattern SPECCAP_C1 = Pattern.compile("\\AU[NK][AIEO]?"); + /* Prefix test: any word that starts with a vowel, not only one-letter words (those are already handled by SIN_AN). */ + private static Pattern VOWEL = Pattern.compile("(?i)\\A[aeiou]"); + /** Picks the indefinite article for a word by testing the patterns above in priority order; the first match wins. @param word the word or abbreviation the article will precede @return "a" or "an"; "a" when no rule matches */ + public static String pickIndefinite(String word) { + // Handle ordinal forms + if(A_ORD.matcher(word).find()) return "a"; + if(AN_ORD.matcher(word).find()) return "an"; + + // Handle special cases + if(EXP_AN.matcher(word).find()) return "an"; + if(SIN_AN.matcher(word).find()) return "an"; + if(SIN_A.matcher(word).find()) return "a"; + + // Handle abbreviations + if(ABBREV_AN.matcher(word).find()) return "an"; + if(ABBREV_C2.matcher(word).find()) return "an"; + if(ABBREV_C3.matcher(word).find()) return "a"; + + // Handle consonants + if(CONSONANT.matcher(word).find()) return "a"; + + // Handle special vowel forms + if(SPECVOWEL_C1.matcher(word).find()) return "a"; + if(SPECVOWEL_C2.matcher(word).find()) return "a"; + if(SPECVOWEL_C3.matcher(word).find()) return "a"; + if(SPECVOWEL_C4.matcher(word).find()) return "an"; + if(SPECVOWEL_C5.matcher(word).find()) return "a"; + + // Handle special capitals + if(SPECCAP_C1.matcher(word).find()) return "a"; + + // Handle vowels + if(VOWEL.matcher(word).find()) return "an"; + + // Handle Y (before certain consonants, it implies a + // (unnaturalized) "I" sound) + if(IN_Y_AN.matcher(word).find()) return "an"; + + // Guess "A" + return "a"; + } } diff --git a/src/main/java/bjc/inflexion/InflectionML.java b/src/main/java/bjc/inflexion/InflectionML.java index 939e96e..9ee175d 100644 --- a/src/main/java/bjc/inflexion/InflectionML.java +++ b/src/main/java/bjc/inflexion/InflectionML.java @@ -1,6 +1,4 @@ -/** - * (C) Copyright 2017 Benjamin Culkin. - * +/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at @@ -15,9 +13,11 @@ */ package bjc.inflexion; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; +import java.util.Iterator; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -45,6 +45,8 @@ public class InflectionML { private static Pattern FORM_MARKER = Pattern.compile("<(?<command>[#N])(?<options>[^:]*):(?<text>[^>]*)>"); + private static Pattern AN_MARKER = Pattern.compile("\\{an(\\d+)\\}"); + /* The database of nouns. */ private static Nouns nounDB; @@ -66,14 +68,19 @@ public class InflectionML { * @return * The inflected string. */ - public static String inflect(final String form) { - final Matcher formMatcher = FORM_MARKER.matcher(form); - - final StringBuffer formBuffer = new StringBuffer(); + public static String inflect(String form) { + Matcher formMatcher = FORM_MARKER.matcher(form); + StringBuffer formBuffer = new StringBuffer(); int curCount = 1; + boolean inflectSingular = true; + int anCount = 0; + List<String> anVals = new ArrayList<>(); + + boolean pendingAN = false; + while (formMatcher.find()) { final String command = formMatcher.group("command"); final String options = formMatcher.group("options"); @@ -129,9 +136,12 @@ public class InflectionML { } if (optionSet.contains("a")) { - /* :InflectionML - * Implement a/an for nouns. - */ + if (curCount == 1) { + anCount += 1; + rep = "{an" + anCount + "}"; + + pendingAN = true; + } } /* Break out of switch. 
*/ @@ -140,10 +150,7 @@ public class InflectionML { break; } - final boolean shouldOverride = - !(rep.equals("no") || - rep.equals("a") || - rep.equals("an") ); + final boolean shouldOverride = !(rep.equals("no") || rep.matches("\\{an\\d+\\}")); if (optionSet.contains("w") && shouldOverride) { rep = EnglishUtils.smallIntToWord(curCount); @@ -162,15 +169,25 @@ public class InflectionML { case "N": final Noun noun = nounDB.getNoun(text); + String nounVal; + if (optionSet.contains("p") || !inflectSingular) { if (optionSet.contains("c")) { - formMatcher.appendReplacement(formBuffer, noun.classicalPlural()); + nounVal = noun.classicalPlural(); } else { - formMatcher.appendReplacement(formBuffer, noun.modernPlural()); + nounVal = noun.modernPlural(); } } else { - formMatcher.appendReplacement(formBuffer, noun.singular()); + nounVal = noun.singular(); } + + formMatcher.appendReplacement(formBuffer, nounVal); + if(pendingAN) { + anVals.add(EnglishUtils.pickIndefinite(nounVal)); + + pendingAN = false; + } + break; default: final String msg = String.format("Unknown command '%s'", command); @@ -181,6 +198,17 @@ public class InflectionML { formMatcher.appendTail(formBuffer); + String res = formBuffer.toString(); + formBuffer = new StringBuffer(); + + Matcher anMat = AN_MARKER.matcher(res); + + Iterator<String> anItr = anVals.iterator(); + while(anMat.find()) { + anMat.appendReplacement(formBuffer, anItr.next()); + } + anMat.appendTail(formBuffer); + return formBuffer.toString(); } diff --git a/src/main/java/bjc/inflexion/nouns/CategoricalNounInflection.java b/src/main/java/bjc/inflexion/nouns/CategoricalNounInflection.java index 1371ab3..fee0b33 100644 --- a/src/main/java/bjc/inflexion/nouns/CategoricalNounInflection.java +++ b/src/main/java/bjc/inflexion/nouns/CategoricalNounInflection.java @@ -1,5 +1,4 @@ -/** - * (C) Copyright 2017 Benjamin Culkin. 
+/* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/CompoundNounInflection.java b/src/main/java/bjc/inflexion/nouns/CompoundNounInflection.java index 6edcb54..bd36202 100644 --- a/src/main/java/bjc/inflexion/nouns/CompoundNounInflection.java +++ b/src/main/java/bjc/inflexion/nouns/CompoundNounInflection.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/DefaultNounInflection.java b/src/main/java/bjc/inflexion/nouns/DefaultNounInflection.java index e982bc9..570aa25 100644 --- a/src/main/java/bjc/inflexion/nouns/DefaultNounInflection.java +++ b/src/main/java/bjc/inflexion/nouns/DefaultNounInflection.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/InflectionAffix.java b/src/main/java/bjc/inflexion/nouns/InflectionAffix.java index d224340..65c6500 100644 --- a/src/main/java/bjc/inflexion/nouns/InflectionAffix.java +++ b/src/main/java/bjc/inflexion/nouns/InflectionAffix.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/InflectionAffixes.java b/src/main/java/bjc/inflexion/nouns/InflectionAffixes.java index facf9d0..645e73a 100644 --- a/src/main/java/bjc/inflexion/nouns/InflectionAffixes.java +++ b/src/main/java/bjc/inflexion/nouns/InflectionAffixes.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/InflectionException.java b/src/main/java/bjc/inflexion/nouns/InflectionException.java index 56715ff..74a88b6 100644 --- a/src/main/java/bjc/inflexion/nouns/InflectionException.java +++ b/src/main/java/bjc/inflexion/nouns/InflectionException.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/IrregularNounInflection.java b/src/main/java/bjc/inflexion/nouns/IrregularNounInflection.java index 471a99e..b336e85 100644 --- a/src/main/java/bjc/inflexion/nouns/IrregularNounInflection.java +++ b/src/main/java/bjc/inflexion/nouns/IrregularNounInflection.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/Noun.java b/src/main/java/bjc/inflexion/nouns/Noun.java index f94e0bc..cd7c855 100644 --- a/src/main/java/bjc/inflexion/nouns/Noun.java +++ b/src/main/java/bjc/inflexion/nouns/Noun.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/NounInflection.java b/src/main/java/bjc/inflexion/nouns/NounInflection.java index 978efdb..94161c5 100644 --- a/src/main/java/bjc/inflexion/nouns/NounInflection.java +++ b/src/main/java/bjc/inflexion/nouns/NounInflection.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/Nouns.java b/src/main/java/bjc/inflexion/nouns/Nouns.java index aeb2f2f..6a36752 100644 --- a/src/main/java/bjc/inflexion/nouns/Nouns.java +++ b/src/main/java/bjc/inflexion/nouns/Nouns.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/Prepositions.java b/src/main/java/bjc/inflexion/nouns/Prepositions.java index 0d36c7e..9564baf 100644 --- a/src/main/java/bjc/inflexion/nouns/Prepositions.java +++ b/src/main/java/bjc/inflexion/nouns/Prepositions.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/SimpleInflectionAffix.java b/src/main/java/bjc/inflexion/nouns/SimpleInflectionAffix.java index 87991b5..93a22e6 100644 --- a/src/main/java/bjc/inflexion/nouns/SimpleInflectionAffix.java +++ b/src/main/java/bjc/inflexion/nouns/SimpleInflectionAffix.java @@ -1,5 +1,4 @@ /** - * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. |
