From 1948e4ea9288191d8de405c9c78460e6f0457cb9 Mon Sep 17 00:00:00 2001 From: "Benjamin J. Culkin" Date: Mon, 11 Sep 2017 16:14:01 -0300 Subject: Add custom string escaper. StringDescaper is a class that allows for adding additional \ escapes that will be processed in a given string. --- .../java/bjc/utils/parserutils/StringDescaper.java | 242 +++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 BJC-Utils2/src/main/java/bjc/utils/parserutils/StringDescaper.java diff --git a/BJC-Utils2/src/main/java/bjc/utils/parserutils/StringDescaper.java b/BJC-Utils2/src/main/java/bjc/utils/parserutils/StringDescaper.java new file mode 100644 index 0000000..096656a --- /dev/null +++ b/BJC-Utils2/src/main/java/bjc/utils/parserutils/StringDescaper.java @@ -0,0 +1,242 @@ +package bjc.utils.parserutils; + +import java.util.HashMap; +import java.util.Map; +import java.util.function.UnaryOperator; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import static java.util.Map.Entry; + +import static bjc.utils.PropertyDB.applyFormat; +import static bjc.utils.PropertyDB.getCompiledRegex; +import static bjc.utils.PropertyDB.getRegex; + +public class StringDescaper { + private Logger LOGGER = Logger.getLogger(StringDescaper.class.getName()); + + /* + * Patterns and pattern parts. + */ + private static String rPossibleEscapeString = getRegex("possibleStringEscape"); + private static Pattern possibleEscapePatt = Pattern.compile(rPossibleEscapeString); + + private static String rShortEscape = getRegex("shortFormStringEscape"); + private static String rOctalEscape = getRegex("octalStringEscape"); + private static String rUnicodeEscape = getRegex("unicodeStringEscape"); + + private String rEscapeString; + private Pattern escapePatt; + + private static String rDoubleQuoteString = applyFormat("doubleQuotes", getRegex("nonStringEscape"), rPossibleEscapeString); + private static Pattern doubleQuotePatt = Pattern.compile(rDoubleQuoteString); + + private static Pattern quotePatt = getCompiledRegex("unescapedQuote"); + + private Map literalEscapes; + private Map> specialEscapes; + + public StringDescaper() { + literalEscapes = new HashMap<>(); + specialEscapes = new HashMap<>(); + + rEscapeString = String.format("\\\\(%1$s|%2$s|%3$s)"); + escapePatt = Pattern.compile(rEscapeString); + } + + public void addLiteralEscape(String escape, String val) { + if(literalEscapes.containsKey(escape)) { + LOGGER.warning(String.format("Shadowing literal escape '%s'\n", escape)); + } + + literalEscapes.put(escape, val); + } + + public void addSpecialEscape(String escape, UnaryOperator val) { + if(specialEscapes.containsKey(escape)) { + LOGGER.warning(String.format("Shadowing special escape '%s'\n", escape)); + } + + /* + * Make sure this special escape is a valid regex. + */ + + Pattern patt = null; + try { + patt = Pattern.compile(escape); + } catch (PatternSyntaxException psex) { + String msg = String.format("Invalid special escape '%s'", escape); + + IllegalArgumentException iaex = new IllegalArgumentException(msg); + iaex.initCause(psex); + + throw psex; + } + + specialEscapes.put(patt, val); + } + + public void compileEscapes() { + StringBuilder work = new StringBuilder(); + + for(String litEscape : literalEscapes.keySet()) { + work.append("|(?:"); + work.append(Pattern.quote(litEscape)); + work.append(")"); + } + + for(Pattern specEscape : specialEscapes.keySet()) { + work.append("|(?:"); + work.append(specEscape.toString()); + work.append(")"); + } + + /* + * Convert user-defined escapes to a regex for matching. + * We don't need a bar before %4 because the string has it. + */ + rEscapeString = String.format("\\(%1$s|%2$s|%3$s%4$s)", rShortEscape, rOctalEscape, rUnicodeEscape, work.toString()); + escapePatt = Pattern.compile(rEscapeString); + } + + /** + * Replace escape characters with their actual equivalents. + * + * @param inp + * The string to replace escape sequences in. + * + * @return The string with escape sequences replaced by their equivalent + * characters. + */ + public String descapeString(final String inp) { + if (inp == null) { + throw new NullPointerException("Input to descapeString must not be null"); + } + + /* + * Prepare the buffer and escape finder. + */ + final StringBuffer work = new StringBuffer(); + final Matcher possibleEscapeFinder = possibleEscapePatt.matcher(inp); + final Matcher escapeFinder = escapePatt.matcher(inp); + + while (possibleEscapeFinder.find()) { + if (!escapeFinder.find()) { + /* + * Found a possible escape that isn't actually an + * escape. + */ + final String msg = String.format("Illegal escape sequence '%s' at position %d of string '%s'", + possibleEscapeFinder.group(), possibleEscapeFinder.start(), inp); + throw new IllegalArgumentException(msg); + } + + final String escapeSeq = escapeFinder.group(); + + /* + * Convert the escape to a string. + */ + String escapeRep = ""; + switch (escapeSeq) { + case "\\b": + escapeRep = "\b"; + break; + case "\\t": + escapeRep = "\t"; + break; + case "\\n": + escapeRep = "\n"; + break; + case "\\f": + escapeRep = "\f"; + break; + case "\\r": + escapeRep = "\r"; + break; + case "\\\"": + escapeRep = "\""; + break; + case "\\'": + escapeRep = "'"; + break; + case "\\\\": + /* + * Skip past the second slash. + */ + possibleEscapeFinder.find(); + escapeRep = "\\"; + break; + default: + if (escapeSeq.startsWith("u")) { + escapeRep = handleUnicodeEscape(escapeSeq.substring(1)); + } else if(escapeSeq.startsWith("O")) { + escapeRep = handleOctalEscape(escapeSeq.substring(1)); + } else if(literalEscapes.containsKey(escapeSeq)) { + escapeRep = literalEscapes.get(escapeSeq); + } else { + for(Entry> ent : specialEscapes.entrySet()) { + Pattern pat = ent.getKey(); + + Matcher mat = pat.matcher(escapeSeq); + if(mat.matches()) { + escapeRep = ent.getValue().apply(escapeSeq); + break; + } + } + } + } + + escapeFinder.appendReplacement(work, escapeRep); + } + + escapeFinder.appendTail(work); + + return work.toString(); + } + + /* + * Handle a unicode codepoint. + */ + private static String handleUnicodeEscape(final String seq) { + try { + final int codepoint = Integer.parseInt(seq, 16); + + return new String(Character.toChars(codepoint)); + } catch (final IllegalArgumentException iaex) { + final String msg = String.format("'%s' is not a valid Unicode escape sequence'", seq); + + final IllegalArgumentException reiaex = new IllegalArgumentException(msg); + + reiaex.initCause(iaex); + + throw reiaex; + } + } + + /* + * Handle a octal codepoint. + */ + private static String handleOctalEscape(final String seq) { + try { + final int codepoint = Integer.parseInt(seq, 8); + + if (codepoint > 255) { + final String msg = String.format("'%d' is outside the range of octal escapes', codepoint"); + + throw new IllegalArgumentException(msg); + } + + return new String(Character.toChars(codepoint)); + } catch (final IllegalArgumentException iaex) { + final String msg = String.format("'%s' is not a valid octal escape sequence'", seq); + + final IllegalArgumentException reiaex = new IllegalArgumentException(msg); + + reiaex.initCause(iaex); + + throw reiaex; + } + } +} -- cgit v1.2.3