From 355b4d1dda5965ea9b58bd2c80e3703a55abce98 Mon Sep 17 00:00:00 2001 From: bculkin2442 Date: Fri, 10 Mar 2017 08:46:10 -0500 Subject: String manipulation additions More and better ways to manipulate strings --- .../java/bjc/utils/funcutils/NeoTokenSplitter.java | 112 +++++++++++++++++ .../main/java/bjc/utils/funcutils/StringUtils.java | 135 +++++++++++++++++++-- 2 files changed, 239 insertions(+), 8 deletions(-) create mode 100644 BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java (limited to 'BJC-Utils2/src/main') diff --git a/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java new file mode 100644 index 0000000..fd4b130 --- /dev/null +++ b/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java @@ -0,0 +1,112 @@ +package bjc.utils.funcutils; + +import java.util.regex.Pattern; + +/** + * Split a string and keep given delimiters. + * + * @author Ben Culkin + */ +public class NeoTokenSplitter { + /* + * This string is a format template for the delimiter matching regex + * + * It does two things + * 1. Match the provided delimiter by positive lookahead + * 2. Match the provided delimiter by positive lookbehind + * + * Thus, it will only match in places where the delimiter is, but won't + * actually match the delimiter, leaving split to put it into the stream + */ + private static String WITH_DELIM = "((?<=%1$s)|(?=%1$s))"; + + /* + * This string is a format template for the multi-delimiter matching + * regex. + * + * It does the same thing as the single delimiter regex, but has to have + * some negative lookahead/lookbehind assertions to avoid splitting a + * delimiter into pieces. + */ + private static String WITH_MULTI_DELIM = "((?<=%1$s+)(?!%1$s)|(? String printDeque(Deque queue) { return queue.isEmpty() ? "(none)" : queue.toString(); } + + /* + * This regex matches java-style string escapes + */ + private static String escapeString = + "\\\\([btnfr\"'\\\\]" // Match shortform escape sequences like \t or \" + + "|[0-3]?[0-7]{1,2}" // Match octal escape sequences + + "|u[0-9a-fA-F]{4})"; // Match unicode escape sequences + private static Pattern escapePatt = Pattern.compile(escapeString); + + /* + * This regular expression matches java style double quoted strings + */ + private static Pattern doubleQuotePatt = Pattern.compile("(\"(" + + "[^\\\\\"]+" // Match one or more characters that aren't quotes or slashes + + "|" + escapeString + ")" // Match escape sequences + + "*\")"); // Match all of those things zero or more times, followed by a closing quote + + /** + * Remove double quoted strings from a string. + * + * Splits a string around instances of java-style double-quoted strings. + * + * @param inp The string to split. + * + * @return An list containing alternating bits of the string and the + * embedded double-quoted strings that seperated them. + */ + public static List removeDQuotedStrings(String inp) { + StringBuffer work = new StringBuffer(); + List res = new LinkedList<>(); + + Matcher mt = doubleQuotePatt.matcher(inp); + + while(mt.find()) { + mt.appendReplacement(work, ""); + + res.add(work.toString()); + res.add(mt.group(1)); + + work = new StringBuffer(); + } + mt.appendTail(work); + res.add(work.toString()); + + return res; + } + + /** + * Replace escape characters with their actual equivalents. + * + * @param inp The string to replace escape sequences in. + * + * @return The string with escape sequences replaced by their equivalent + * characters. + */ + public static String descapeString(String inp) { + StringBuffer work = new StringBuffer(); + + Matcher escapeFinder = escapePatt.matcher(inp); + while(escapeFinder.find()) { + String escapeSeq = escapeFinder.group(); + + String escapeRep = ""; + switch(escapeSeq) { + case "\\b": + escapeRep = "\b"; + break; + case "\\t": + escapeRep = "\t"; + break; + case "\\n": + escapeRep = "\n"; + break; + case "\\f": + escapeRep = "\f"; + break; + case "\\r": + escapeRep = "\r"; + break; + case "\\\"": + escapeRep = "\""; + break; + case "\\'": + escapeRep = "'"; + break; + case "\\\\": + escapeRep = "\\"; + break; + default: + if(escapeSeq.startsWith("u")) { + escapeRep = handleUnicodeEscape(escapeSeq.substring(1)); + } else { + escapeRep = handleOctalEscape(escapeSeq); + } + } + + escapeFinder.appendReplacement(work, escapeRep); + } + escapeFinder.appendTail(work); + + return work.toString(); + } + + private static String handleUnicodeEscape(String seq) { + int codepoint = Integer.parseInt(seq, 16); + + return new String(Character.toChars(codepoint)); + } + + private static String handleOctalEscape(String seq) { + int codepoint = Integer.parseInt(seq, 8); + + return new String(Character.toChars(codepoint)); + } } -- cgit v1.2.3