diff options
Diffstat (limited to 'BJC-Utils2/src/main/java/bjc/utils')
| -rw-r--r-- | BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java | 112 | ||||
| -rw-r--r-- | BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java | 135 |
2 files changed, 239 insertions, 8 deletions
diff --git a/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java
new file mode 100644
index 0000000..fd4b130
--- /dev/null
+++ b/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java
@@ -0,0 +1,137 @@
+package bjc.utils.funcutils;
+
+import java.util.regex.Pattern;
+
+/**
+ * Split a string and keep given delimiters.
+ *
+ * Delimiters are added with addDelimiter/addMultiDelimiter, after which
+ * compile must be called before split can be used.
+ *
+ * @author Ben Culkin
+ */
+public class NeoTokenSplitter {
+	/*
+	 * This string is a format template for the delimiter matching regex
+	 *
+	 * It does two things
+	 * 1. Match the provided delimiter by positive lookahead
+	 * 2. Match the provided delimiter by positive lookbehind
+	 *
+	 * Thus, it will only match in places where the delimiter is, but won't
+	 * actually match the delimiter, leaving split to put it into the stream
+	 */
+	private static final String WITH_DELIM = "((?<=%1$s)|(?=%1$s))";
+
+	/*
+	 * This string is a format template for the multi-delimiter matching
+	 * regex.
+	 *
+	 * It does the same thing as the single delimiter regex, but has to have
+	 * some negative lookahead/lookbehind assertions to avoid splitting a
+	 * delimiter into pieces.
+	 */
+	private static final String WITH_MULTI_DELIM = "((?<=%1$s+)(?!%1$s)|(?<!%1$s)(?=%1$s+))";
+
+	/* The alternation of delimiter patterns built so far; null until one is added. */
+	private StringBuilder currPatt;
+
+	/* The compiled form of currPatt; null until compile() has been called. */
+	private Pattern compPatt;
+
+	/**
+	 * Create a new token splitter.
+	 */
+	public NeoTokenSplitter() {
+	}
+
+	/**
+	 * Split a provided string using configured delimiters, and keeping the
+	 * delimiters.
+	 *
+	 * The splitter must be compiled first.
+	 *
+	 * @param inp The string to split.
+	 *
+	 * @return The split string, including delimiters.
+	 *
+	 * @throws IllegalStateException If the splitter isn't compiled.
+	 */
+	public String[] split(String inp) {
+		if(compPatt == null) {
+			throw new IllegalStateException("Token splitter has not been compiled yet");
+		}
+
+		return compPatt.split(inp);
+	}
+
+	/**
+	 * Adds a string as a matched delimiter to split on.
+	 *
+	 * Only works for fixed length delimiters.
+	 *
+	 * The provided string is regex-escaped before being used.
+	 *
+	 * The new delimiter only takes effect after the next call to compile.
+	 *
+	 * @param delim The delimiter to match on.
+	 *
+	 * @throws NullPointerException If the delimiter is null.
+	 */
+	public void addDelimiter(String delim) {
+		if(delim == null) {
+			throw new NullPointerException("Delimiter must not be null");
+		}
+
+		addAlternative(String.format(WITH_DELIM, Pattern.quote(delim)));
+	}
+
+	/**
+	 * Adds a character class as a matched delimiter to split on.
+	 *
+	 * The provided string should be a pattern to match one or more
+	 * occurrences of. It is used as a regex as-is, without escaping.
+	 *
+	 * The new delimiter only takes effect after the next call to compile.
+	 *
+	 * @param delim The delimiter to split on.
+	 *
+	 * @throws NullPointerException If the delimiter is null.
+	 */
+	public void addMultiDelimiter(String delim) {
+		if(delim == null) {
+			throw new NullPointerException("Delimiter must not be null");
+		}
+
+		addAlternative(String.format(WITH_MULTI_DELIM, "(?:" + delim + ")"));
+	}
+
+	/*
+	 * Append a delimiter pattern to the pattern being built, as a new
+	 * alternative.
+	 */
+	private void addAlternative(String delimPat) {
+		if(currPatt == null) {
+			currPatt = new StringBuilder();
+		} else {
+			currPatt.append("|");
+		}
+
+		currPatt.append("(?:").append(delimPat).append(")");
+	}
+
+	/**
+	 * Compiles the current set of delimiters to a pattern.
+	 *
+	 * Makes this splitter ready to use.
+	 *
+	 * @throws IllegalStateException If no delimiters have been added.
+	 */
+	public void compile() {
+		if(currPatt == null) {
+			throw new IllegalStateException("No delimiters have been added to compile");
+		}
+
+		compPatt = Pattern.compile(currPatt.toString());
+	}
+}
diff --git a/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java b/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java
index 6770df2..718514c 100644
--- a/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java
+++ b/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java
@@ -1,6 +1,10 @@
 package bjc.utils.funcutils;
 
 import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * Utility methods for operations on strings
@@ -24,8 +28,8 @@ public class StringUtils {
 		// Bit annoying to have to use a full class name, but what are you
 		// going to do?
return org.apache.commons.lang3.StringUtils.countMatches(expression, operator) == 1 - && !expression.equalsIgnoreCase(operator) - && !expression.startsWith(operator); + && !expression.equalsIgnoreCase(operator) + && !expression.startsWith(operator); } /** @@ -40,6 +44,12 @@ public class StringUtils { * of the provided regex */ public static boolean containsOnly(String input, String regex) { + if (input == null) { + throw new NullPointerException("Input must not be null"); + } else if (regex == null) { + throw new NullPointerException("Regex must not be null"); + } + /* * This regular expression is fairly simple. * @@ -48,12 +58,6 @@ public class StringUtils { * group is then matched one or more times and the pattern matches * to the end of the string */ - if (input == null) { - throw new NullPointerException("Input must not be null"); - } else if (regex == null) { - throw new NullPointerException("Regex must not be null"); - } - return input.matches("\\A(?:" + regex + ")+\\Z"); } @@ -85,4 +89,119 @@ public class StringUtils { public static <ContainedType> String printDeque(Deque<ContainedType> queue) { return queue.isEmpty() ? "(none)" : queue.toString(); } + + /* + * This regex matches java-style string escapes + */ + private static String escapeString = + "\\\\([btnfr\"'\\\\]" // Match shortform escape sequences like \t or \" + + "|[0-3]?[0-7]{1,2}" // Match octal escape sequences + + "|u[0-9a-fA-F]{4})"; // Match unicode escape sequences + private static Pattern escapePatt = Pattern.compile(escapeString); + + /* + * This regular expression matches java style double quoted strings + */ + private static Pattern doubleQuotePatt = Pattern.compile("(\"(" + + "[^\\\\\"]+" // Match one or more characters that aren't quotes or slashes + + "|" + escapeString + ")" // Match escape sequences + + "*\")"); // Match all of those things zero or more times, followed by a closing quote + + /** + * Remove double quoted strings from a string. 
+ * + * Splits a string around instances of java-style double-quoted strings. + * + * @param inp The string to split. + * + * @return An list containing alternating bits of the string and the + * embedded double-quoted strings that seperated them. + */ + public static List<String> removeDQuotedStrings(String inp) { + StringBuffer work = new StringBuffer(); + List<String> res = new LinkedList<>(); + + Matcher mt = doubleQuotePatt.matcher(inp); + + while(mt.find()) { + mt.appendReplacement(work, ""); + + res.add(work.toString()); + res.add(mt.group(1)); + + work = new StringBuffer(); + } + mt.appendTail(work); + res.add(work.toString()); + + return res; + } + + /** + * Replace escape characters with their actual equivalents. + * + * @param inp The string to replace escape sequences in. + * + * @return The string with escape sequences replaced by their equivalent + * characters. + */ + public static String descapeString(String inp) { + StringBuffer work = new StringBuffer(); + + Matcher escapeFinder = escapePatt.matcher(inp); + while(escapeFinder.find()) { + String escapeSeq = escapeFinder.group(); + + String escapeRep = ""; + switch(escapeSeq) { + case "\\b": + escapeRep = "\b"; + break; + case "\\t": + escapeRep = "\t"; + break; + case "\\n": + escapeRep = "\n"; + break; + case "\\f": + escapeRep = "\f"; + break; + case "\\r": + escapeRep = "\r"; + break; + case "\\\"": + escapeRep = "\""; + break; + case "\\'": + escapeRep = "'"; + break; + case "\\\\": + escapeRep = "\\"; + break; + default: + if(escapeSeq.startsWith("u")) { + escapeRep = handleUnicodeEscape(escapeSeq.substring(1)); + } else { + escapeRep = handleOctalEscape(escapeSeq); + } + } + + escapeFinder.appendReplacement(work, escapeRep); + } + escapeFinder.appendTail(work); + + return work.toString(); + } + + private static String handleUnicodeEscape(String seq) { + int codepoint = Integer.parseInt(seq, 16); + + return new String(Character.toChars(codepoint)); + } + + private static String 
handleOctalEscape(String seq) { + int codepoint = Integer.parseInt(seq, 8); + + return new String(Character.toChars(codepoint)); + } } |
