From a883e7d100c54451fb9256cb3867c2571ee4fff1 Mon Sep 17 00:00:00 2001 From: "Benjamin J. Culkin" Date: Sun, 16 Sep 2018 22:27:15 -0300 Subject: Update --- src/main/java/bjc/inflexion/EnglishUtils.java | 156 ++++++++++++--------- .../inflexion/nouns/CategoricalNounInflection.java | 1 + .../java/bjc/inflexion/nouns/NounInflection.java | 1 + 3 files changed, 89 insertions(+), 69 deletions(-) (limited to 'src/main/java') diff --git a/src/main/java/bjc/inflexion/EnglishUtils.java b/src/main/java/bjc/inflexion/EnglishUtils.java index e233018..197b7cf 100644 --- a/src/main/java/bjc/inflexion/EnglishUtils.java +++ b/src/main/java/bjc/inflexion/EnglishUtils.java @@ -13,6 +13,7 @@ */ package bjc.inflexion; +import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -28,16 +29,16 @@ public class EnglishUtils { private static String[] summaryNums = new String[] { "no", "one", "a couple of", "a few", "several" }; private static int[] summaryMap = new int[] { - /* no */ - 0, - /* one */ - 1, - /* a couple of */ - 2, - /* a few */ - 3, 3, 3, - /* several */ - 4, 4, 4, 4 + /* no */ + 0, + /* one */ + 1, + /* a couple of */ + 2, + /* a few */ + 3, 3, 3, + /* several */ + 4, 4, 4, 4 }; /** @@ -73,67 +74,84 @@ public class EnglishUtils { return "many"; } - private static Pattern AN_ORD = Pattern.compile("(?i)\\A[aefhilmnorsx]-?th\\Z"); - private static Pattern A_ORD = Pattern.compile("(?i)\\A[bcdgjkpqtuvwyz]-?th\\Z"); - private static Pattern EXP_AN = Pattern.compile("(?i)\\A(?:euler|hour(?!i)|heir|honest|hono)"); - private static Pattern SIN_AN = Pattern.compile("(?i)\\A[aefhilmnorst]\\Z"); - private static Pattern SIN_A = Pattern.compile("(?i)\\A[bcdgjkpqtuvwyz]\\Z"); - private static Pattern ABBREV_AN = Pattern.compile("\\A(?!FJO|[HLMNS]Y|RY[EQ]|SQU|(F[LR]?|[HL]|MN?|N|RH?|S[CHKLMNPTVW]?|X(YL)?)[AEIOU])[FHLMNRSX][A-Z]"); + public static String pickIndefinite(String phrase) { + Pattern pattern; + Matcher matcher; + String word, lowercaseWord; + + if (phrase.length() == 0) { + return "a"; + } + + // Getting the first word + pattern = Pattern.compile("(\\w+)\\s*.*"); + matcher = pattern.matcher(phrase); + if(matcher.matches() == true) { + word = matcher.group(1); + } else { + return "an"; + } + + lowercaseWord = word.toLowerCase(); + + // Specific start of words that should be preceded by 'an' + String [] altCases = { "euler", "heir", "honest", "hono" }; + for (String altCase : altCases) { + if (lowercaseWord.startsWith(altCase) == true) { + return "an"; + } + } + + if (lowercaseWord.startsWith("hour") == true && lowercaseWord.startsWith("houri") == false) { + return "an"; + } + + + // Single letter word which should be preceded by 'an' + if (lowercaseWord.length() == 1) { + if ("aedhilmnorsx".indexOf(lowercaseWord) >= 0) { + return "an"; + } else { + return "a"; + } + } + + // Capital words which should likely be preceded by 'an' + if (word.matches("(?!FJO|[HLMNS]Y.|RY[EO]|SQU|(F[LR]?|[HL]|MN?|N|RH?|S[CHKLMNPTVW]?|X(YL)?)[AEIOU])[FHLMNRSX][A-Z]")) { + return "an"; + } + + // Special cases where a word that begins with a vowel should be preceded by 'a' + String [] regexes = { "^e[uw]", "^onc?e\\b", "^uni([^nmd]|mo)", "^u[bcfhjkqrst][aeiou]" }; + + for (String regex : regexes) { + if (lowercaseWord.matches(regex+".*") == true) { + return "a"; + } + } + + // Special capital words (UK, UN) + if (word.matches("^U[NK][AIEO].*") == true) { + return "a"; + } else if (word == word.toUpperCase()) { + if ("aedhilmnorsx".indexOf(lowercaseWord.substring(0, 1)) >= 0) { + return "an"; + } else { + return "a"; + } + } + + // Basic method of words that begin with a vowel being preceded by 'an' + if ("aeiou".indexOf(lowercaseWord.substring(0, 1)) >= 0) { + return "an"; + } + + // Instances where y followed by specific letters is preceded by 'an' + if (lowercaseWord.matches("^y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt).*")) { + return "an"; + } - private static Pattern IN_Y_AN = Pattern.compile("(?i)\\Ay(?:b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)"); - - private static Pattern ABBREV_C2 = Pattern.compile("(?i)\\A[aefhilmnorsx][.-]"); - private static Pattern ABBREV_C3 = Pattern.compile("(?i)\\A[a-z][.-]"); - - private static Pattern CONSONANT = Pattern.compile("(?i)\\A[^aeiouy]"); - - private static Pattern SPECVOWEL_C1 = Pattern.compile("(?i)\\Ae[uw]"); - private static Pattern SPECVOWEL_C2 = Pattern.compile("(?i)\\Aonc?e\b"); - private static Pattern SPECVOWEL_C3 = Pattern.compile("(?i)\\Auni(?:[^nmd]|mo)"); - private static Pattern SPECVOWEL_C4 = Pattern.compile("(?i)\\Aut[th]"); - private static Pattern SPECVOWEL_C5 = Pattern.compile("(?i)\\Au[bcfhjkqrst][aeiou]"); - - private static Pattern SPECCAP_C1 = Pattern.compile("\\AU[NK][AIEO]?"); - - private static Pattern VOWEL = Pattern.compile("(?i)\\A[aeiou]\\Z"); - - public static String pickIndefinite(String word) { - // Handle ordinal forms - if(A_ORD.matcher(word).find()) return "a"; - if(AN_ORD.matcher(word).find()) return "an"; - - // Handle special cases - if(EXP_AN.matcher(word).find()) return "an"; - if(SIN_AN.matcher(word).find()) return "an"; - if(SIN_A.matcher(word).find()) return "a"; - - // Handle abbreviations - if(ABBREV_AN.matcher(word).find()) return "an"; - if(ABBREV_C2.matcher(word).find()) return "an"; - if(ABBREV_C3.matcher(word).find()) return "a"; - - // Handle consonants - if(CONSONANT.matcher(word).find()) return "a"; - - // Handle special vowel forms - if(SPECVOWEL_C1.matcher(word).find()) return "a"; - if(SPECVOWEL_C2.matcher(word).find()) return "a"; - if(SPECVOWEL_C3.matcher(word).find()) return "a"; - if(SPECVOWEL_C4.matcher(word).find()) return "an"; - if(SPECVOWEL_C5.matcher(word).find()) return "a"; - - // Handle special capitals - if(SPECCAP_C1.matcher(word).find()) return "a"; - - // Handle vowels - if(VOWEL.matcher(word).find()) return "an"; - - // Handle Y (before certain consonants, it implies a - // (unnaturalized) "I" sound) - if(IN_Y_AN.matcher(word).find()) return "an"; - - // Guess "A" return "a"; } } diff --git a/src/main/java/bjc/inflexion/nouns/CategoricalNounInflection.java b/src/main/java/bjc/inflexion/nouns/CategoricalNounInflection.java index fee0b33..9fafcff 100644 --- a/src/main/java/bjc/inflexion/nouns/CategoricalNounInflection.java +++ b/src/main/java/bjc/inflexion/nouns/CategoricalNounInflection.java @@ -1,4 +1,5 @@ /* + * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/bjc/inflexion/nouns/NounInflection.java b/src/main/java/bjc/inflexion/nouns/NounInflection.java index 94161c5..978efdb 100644 --- a/src/main/java/bjc/inflexion/nouns/NounInflection.java +++ b/src/main/java/bjc/inflexion/nouns/NounInflection.java @@ -1,4 +1,5 @@ /** + * (C) Copyright 2017 Benjamin Culkin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. -- cgit v1.2.3