From 355b4d1dda5965ea9b58bd2c80e3703a55abce98 Mon Sep 17 00:00:00 2001
From: bculkin2442 <bjculkin@mix.wvu.edu>
Date: Fri, 10 Mar 2017 08:46:10 -0500
Subject: String manipulation additions

More and better ways to manipulate strings
---
 .../java/bjc/utils/funcutils/NeoTokenSplitter.java | 112 +++++++++++++++++
 .../main/java/bjc/utils/funcutils/StringUtils.java | 135 +++++++++++++++++++--
 2 files changed, 239 insertions(+), 8 deletions(-)
 create mode 100644 BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java

(limited to 'BJC-Utils2/src/main')

diff --git a/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java
new file mode 100644
index 0000000..fd4b130
--- /dev/null
+++ b/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java
@@ -0,0 +1,112 @@
+package bjc.utils.funcutils;
+
+import java.util.regex.Pattern;
+
+/**
+ * Split a string and keep given delimiters.
+ *
+ * @author Ben Culkin
+ */
+public class NeoTokenSplitter {
+	/*
+	 * This string is a format template for the delimiter matching regex
+	 *
+	 * It does two things
+	 * 1. Match the provided delimiter by positive lookahead
+	 * 2. Match the provided delimiter by positive lookbehind
+	 *
+	 * Thus, it will only match in places where the delimiter is, but won't
+	 * actually match the delimiter, leaving split to put it into the stream
+	 */
+	private static final String WITH_DELIM = "((?<=%1$s)|(?=%1$s))";
+
+	/*
+	 * This string is a format template for the multi-delimiter matching
+	 * regex.
+	 *
+	 * It does the same thing as the single delimiter regex, but has to have
+	 * some negative lookahead/lookbehind assertions to avoid splitting a
+	 * delimiter into pieces.
+	 */
+	private static final String WITH_MULTI_DELIM = "((?<=%1$s+)(?!%1$s)|(?<!%1$s)(?=%1$s+))";
+
+	/* Alternation of the delimiter patterns added so far; null until one is added. */
+	private StringBuilder currPatt;
+	/* Compiled form of currPatt; null until compile() has been called. */
+	private Pattern compPatt;
+
+	/**
+	 * Create a new token splitter.
+	 */
+	public NeoTokenSplitter() {
+	}
+
+	/**
+	 * Split a provided string using configured delimiters, and keeping the
+	 * delimiters.
+	 *
+	 * The splitter must be compiled first.
+	 *
+	 * @param inp The string to split.
+	 *
+	 * @return The split string, including delimiters.
+	 *
+	 * @throws IllegalStateException If the splitter isn't compiled.
+	 */
+	public String[] split(String inp) {
+		if(compPatt == null) {
+			throw new IllegalStateException("Token splitter has not been compiled yet");
+		}
+
+		return compPatt.split(inp);
+	}
+
+	/**
+	 * Adds a string as a matched delimiter to split on.
+	 *
+	 * Only works for fixed length delimiters.
+	 *
+	 * The provided string is regex-escaped before being used.
+	 *
+	 * @param delim The delimiter to match on.
+	 */
+	public void addDelimiter(String delim) {
+		addAlternative(String.format(WITH_DELIM, Pattern.quote(delim)));
+	}
+
+	/**
+	 * Adds a character class as a matched delimiter to split on.
+	 *
+	 * The provided string should be a pattern to match one or more
+	 * occurrences of.
+	 *
+	 * @param delim The delimiter to split on.
+	 */
+	public void addMultiDelimiter(String delim) {
+		addAlternative(String.format(WITH_MULTI_DELIM, "(?:" + delim + ")"));
+	}
+
+	/* Appends one delimiter pattern to the alternation, creating it on first use. */
+	private void addAlternative(String delimPat) {
+		if(currPatt == null) {
+			currPatt = new StringBuilder();
+		} else {
+			currPatt.append("|");
+		}
+		currPatt.append("(?:").append(delimPat).append(")");
+	}
+
+	/**
+	 * Compiles the current set of delimiters to a pattern, making this
+	 * splitter ready to use.
+	 *
+	 * @throws IllegalStateException If no delimiters have been added yet.
+	 */
+	public void compile() {
+		// Without this guard an empty splitter failed with a bare NullPointerException.
+		if(currPatt == null) {
+			throw new IllegalStateException("Token splitter has no delimiters to compile");
+		}
+		compPatt = Pattern.compile(currPatt.toString());
+	}
+}
diff --git a/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java b/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java
index 6770df2..718514c 100644
--- a/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java
+++ b/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java
@@ -1,6 +1,10 @@
 package bjc.utils.funcutils;
 
 import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * Utility methods for operations on strings
@@ -24,8 +28,8 @@ public class StringUtils {
 		// Bit annoying to have to use a full class name, but what are you
 		// going to do?
 		return org.apache.commons.lang3.StringUtils.countMatches(expression, operator) == 1
-				&& !expression.equalsIgnoreCase(operator)
-				&& !expression.startsWith(operator);
+			&& !expression.equalsIgnoreCase(operator)
+			&& !expression.startsWith(operator);
 	}
 
 	/**
@@ -40,6 +44,12 @@ public class StringUtils {
 	 *         of the provided regex
 	 */
 	public static boolean containsOnly(String input, String regex) {
+		if (input == null) {
+			throw new NullPointerException("Input must not be null");
+		} else if (regex == null) {
+			throw new NullPointerException("Regex must not be null");
+		}
+
 		/*
 		 * This regular expression is fairly simple.
 		 * 
@@ -48,12 +58,6 @@ public class StringUtils {
 		 * group is then matched one or more times and the pattern matches
 		 * to the end of the string
 		 */
-		if (input == null) {
-			throw new NullPointerException("Input must not be null");
-		} else if (regex == null) {
-			throw new NullPointerException("Regex must not be null");
-		}
-
 		return input.matches("\\A(?:" + regex + ")+\\Z");
 	}
 
@@ -85,4 +89,119 @@ public class StringUtils {
 	public static <ContainedType> String printDeque(Deque<ContainedType> queue) {
 		return queue.isEmpty() ? "(none)" : queue.toString();
 	}
+
+	/*
+	 * Regex source for java-style string escapes; also embedded in doubleQuotePatt.
+	 */
+	private static final String escapeString =
+		"\\\\([btnfr\"'\\\\]"      // Match shortform escape sequences like \t or \"
+		+ "|[0-3]?[0-7]{1,2}" // Match octal escape sequences
+		+ "|u[0-9a-fA-F]{4})";  // Match unicode escape sequences
+	private static final Pattern escapePatt = Pattern.compile(escapeString);
+
+	/*
+	 * This regular expression matches java style double quoted strings
+	 */
+	private static final Pattern doubleQuotePatt = Pattern.compile("(\"("
+			+ "[^\\\\\"]+"                  // Match one or more characters that aren't quotes or backslashes
+			+ "|" + escapeString + ")" // Match escape sequences
+			+ "*\")");                      // Match all of those things zero or more times, followed by a closing quote
+
+	/**
+	 * Split a string around the double quoted strings embedded in it.
+	 *
+	 * Despite the name, the java-style double-quoted strings are kept in the result.
+	 *
+	 * @param inp The string to split.
+	 *
+	 * @return A list containing alternating bits of the string and the
+	 * embedded double-quoted strings that separated them.
+	 */
+	public static List<String> removeDQuotedStrings(String inp) {
+		List<String> pieces = new LinkedList<>();
+		Matcher quoteFinder = doubleQuotePatt.matcher(inp);
+
+		// Offset just past the previous quoted string; 0 before the first one.
+		int prevEnd = 0;
+		while(quoteFinder.find()) {
+			// The text between the previous quoted string and this one...
+			pieces.add(inp.substring(prevEnd, quoteFinder.start()));
+			// ...followed by the quoted string itself.
+			pieces.add(quoteFinder.group(1));
+
+			prevEnd = quoteFinder.end();
+		}
+
+		// Whatever follows the last quoted string, possibly empty.
+		pieces.add(inp.substring(prevEnd));
+		return pieces;
+	}
+
+	/**
+	 * Replace java-style escape sequences with the characters they denote.
+	 *
+	 * @param inp The string to replace escape sequences in.
+	 *
+	 * @return The string with escape sequences replaced by their equivalent
+	 * characters.
+	 */
+	public static String descapeString(String inp) {
+		StringBuffer work = new StringBuffer();
+
+		Matcher escapeFinder = escapePatt.matcher(inp);
+		while(escapeFinder.find()) {
+			// Group 1 is the escape body, without the leading backslash.
+			String escapeSeq = escapeFinder.group(1);
+			String escapeRep = "";
+			switch(escapeSeq) {
+			case "b":
+				escapeRep = "\b";
+				break;
+			case "t":
+				escapeRep = "\t";
+				break;
+			case "n":
+				escapeRep = "\n";
+				break;
+			case "f":
+				escapeRep = "\f";
+				break;
+			case "r":
+				escapeRep = "\r";
+				break;
+			case "\"":
+				escapeRep = "\"";
+				break;
+			case "'":
+				escapeRep = "'";
+				break;
+			case "\\":
+				escapeRep = "\\";
+				break;
+			default:
+				if(escapeSeq.startsWith("u")) {
+					escapeRep = handleUnicodeEscape(escapeSeq.substring(1));
+				} else {
+					escapeRep = handleOctalEscape(escapeSeq);
+				}
+			}
+			// Quote the replacement: a raw backslash or '$' is special to appendReplacement.
+			escapeFinder.appendReplacement(work, Matcher.quoteReplacement(escapeRep));
+		}
+		escapeFinder.appendTail(work);
+
+		return work.toString();
+	}
+
+	private static String handleUnicodeEscape(String seq) {
+		// Read seq as a base-16 code point, then expand that code point to chars.
+		char[] decoded = Character.toChars(Integer.parseInt(seq, 16));
+		return String.valueOf(decoded);
+	}
+
+	private static String handleOctalEscape(String seq) {
+		// Read seq as a base-8 code point, then expand that code point to chars.
+		char[] decoded = Character.toChars(Integer.parseInt(seq, 8));
+		return String.valueOf(decoded);
+	}
 }
-- 
cgit v1.2.3