summaryrefslogtreecommitdiff
path: root/BJC-Utils2/src/main/java/bjc/utils
diff options
context:
space:
mode:
Diffstat (limited to 'BJC-Utils2/src/main/java/bjc/utils')
-rw-r--r--BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java112
-rw-r--r--BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java135
2 files changed, 239 insertions, 8 deletions
diff --git a/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java
new file mode 100644
index 0000000..fd4b130
--- /dev/null
+++ b/BJC-Utils2/src/main/java/bjc/utils/funcutils/NeoTokenSplitter.java
@@ -0,0 +1,112 @@
+package bjc.utils.funcutils;
+
+import java.util.regex.Pattern;
+
+/**
+ * Splits a string around a configurable set of delimiters while keeping the
+ * delimiters themselves as separate tokens in the result.
+ *
+ * Delimiters are registered with {@link #addDelimiter(String)} and
+ * {@link #addMultiDelimiter(String)}; {@link #compile()} must then be called
+ * before {@link #split(String)} can be used.
+ *
+ * @author Ben Culkin
+ */
+public class NeoTokenSplitter {
+    /*
+     * Format template for the single-delimiter regex.
+     *
+     * It matches the empty string
+     *   1. directly after the delimiter (positive lookbehind), and
+     *   2. directly before the delimiter (positive lookahead).
+     *
+     * Because only zero-width positions are matched, the delimiter text is
+     * never consumed and split leaves it in the output.
+     */
+    private static final String WITH_DELIM = "((?<=%1$s)|(?=%1$s))";
+
+    /*
+     * Format template for the multi-delimiter regex.
+     *
+     * It matches the boundaries of a run of delimiters: the position where a
+     * delimiter ends and no delimiter follows, and the position where a
+     * delimiter starts and no delimiter precedes. The negative assertions
+     * keep a run of delimiters from being split into pieces.
+     *
+     * No '+' quantifier is applied inside the lookarounds: a position that
+     * is preceded (or followed) by one or more delimiters is also preceded
+     * (or followed) by exactly one, so the quantifier adds nothing, and an
+     * unbounded lookbehind may be rejected by the regex compiler.
+     */
+    private static final String WITH_MULTI_DELIM = "((?<=%1$s)(?!%1$s)|(?<!%1$s)(?=%1$s))";
+
+    /* Alternation of all delimiter patterns added so far; null until the first add. */
+    private StringBuilder currPatt;
+
+    /* Compiled form of currPatt; null until compile() is called. */
+    private Pattern compPatt;
+
+    /**
+     * Create a new token splitter.
+     */
+    public NeoTokenSplitter() {
+    }
+
+    /**
+     * Split a provided string using configured delimiters, and keeping the
+     * delimiters.
+     *
+     * The splitter must be compiled first.
+     *
+     * @param inp The string to split.
+     *
+     * @return The split string, including delimiters.
+     *
+     * @throws IllegalStateException If the splitter isn't compiled.
+     */
+    public String[] split(String inp) {
+        if (compPatt == null) {
+            throw new IllegalStateException("Token splitter has not been compiled yet");
+        }
+
+        return compPatt.split(inp);
+    }
+
+    /**
+     * Adds a string as a matched delimiter to split on.
+     *
+     * Only works for fixed length delimiters.
+     *
+     * The provided string is regex-escaped before being used.
+     *
+     * @param delim The delimiter to match on.
+     */
+    public void addDelimiter(String delim) {
+        appendAlternative(String.format(WITH_DELIM, Pattern.quote(delim)));
+    }
+
+    /**
+     * Adds a character class as a matched delimiter to split on.
+     *
+     * The provided string should be a pattern to match one or more
+     * occurrences of. It is used as a regex as-is (not escaped).
+     *
+     * @param delim The delimiter to split on.
+     */
+    public void addMultiDelimiter(String delim) {
+        appendAlternative(String.format(WITH_MULTI_DELIM, "(?:" + delim + ")"));
+    }
+
+    /**
+     * Compiles the current set of delimiters to a pattern.
+     *
+     * Makes this splitter ready to use.
+     *
+     * @throws IllegalStateException If no delimiters have been added yet.
+     */
+    public void compile() {
+        if (currPatt == null) {
+            throw new IllegalStateException("No delimiters have been added to the token splitter");
+        }
+
+        compPatt = Pattern.compile(currPatt.toString());
+    }
+
+    /*
+     * Adds one delimiter pattern to the alternation being built, wrapping it
+     * in a non-capturing group and separating it from earlier ones with '|'.
+     */
+    private void appendAlternative(String delimPat) {
+        if (currPatt == null) {
+            currPatt = new StringBuilder();
+        } else {
+            currPatt.append('|');
+        }
+
+        currPatt.append("(?:").append(delimPat).append(')');
+    }
+}
diff --git a/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java b/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java
index 6770df2..718514c 100644
--- a/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java
+++ b/BJC-Utils2/src/main/java/bjc/utils/funcutils/StringUtils.java
@@ -1,6 +1,10 @@
package bjc.utils.funcutils;
import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
/**
* Utility methods for operations on strings
@@ -24,8 +28,8 @@ public class StringUtils {
// Bit annoying to have to use a full class name, but what are you
// going to do?
return org.apache.commons.lang3.StringUtils.countMatches(expression, operator) == 1
- && !expression.equalsIgnoreCase(operator)
- && !expression.startsWith(operator);
+ && !expression.equalsIgnoreCase(operator)
+ && !expression.startsWith(operator);
}
/**
@@ -40,6 +44,12 @@ public class StringUtils {
* of the provided regex
*/
public static boolean containsOnly(String input, String regex) {
+ if (input == null) {
+ throw new NullPointerException("Input must not be null");
+ } else if (regex == null) {
+ throw new NullPointerException("Regex must not be null");
+ }
+
/*
* This regular expression is fairly simple.
*
@@ -48,12 +58,6 @@ public class StringUtils {
* group is then matched one or more times and the pattern matches
* to the end of the string
*/
- if (input == null) {
- throw new NullPointerException("Input must not be null");
- } else if (regex == null) {
- throw new NullPointerException("Regex must not be null");
- }
-
return input.matches("\\A(?:" + regex + ")+\\Z");
}
@@ -85,4 +89,119 @@ public class StringUtils {
public static <ContainedType> String printDeque(Deque<ContainedType> queue) {
return queue.isEmpty() ? "(none)" : queue.toString();
}
+
+    /*
+     * Regex source matching a single java-style string escape: a backslash
+     * followed by the escape body. Capture group 1 is the body, i.e. the
+     * text after the backslash.
+     *
+     * Kept as a string as well as a compiled pattern because it is also
+     * embedded into the double-quoted string pattern below.
+     */
+    private static final String escapeString =
+        "\\\\([btnfr\"'\\\\]" // Match shortform escape sequences like \t or \"
+        + "|[0-3]?[0-7]{1,2}" // Match octal escape sequences
+        + "|u[0-9a-fA-F]{4})"; // Match unicode escape sequences
+    private static final Pattern escapePatt = Pattern.compile(escapeString);
+
+    /*
+     * This regular expression matches java style double quoted strings.
+     * Capture group 1 is the whole quoted string, including both quotes.
+     */
+    private static final Pattern doubleQuotePatt = Pattern.compile("(\"("
+        + "[^\\\\\"]+" // Match one or more characters that aren't quotes or slashes
+        + "|" + escapeString + ")" // Match escape sequences
+        + "*\")"); // Match all of those things zero or more times, followed by a closing quote
+
+    /**
+     * Split a string around the java-style double-quoted strings embedded
+     * in it.
+     *
+     * @param inp The string to split.
+     *
+     * @return A list that alternates between the text outside of quoted
+     *         strings and the double-quoted strings (quotes included) that
+     *         separated it. The list always starts and ends with an
+     *         unquoted piece, which may be empty.
+     */
+    public static List<String> removeDQuotedStrings(String inp) {
+        List<String> pieces = new LinkedList<>();
+        Matcher quoteFinder = doubleQuotePatt.matcher(inp);
+
+        // Index just past the previously matched quoted string.
+        int unquotedStart = 0;
+
+        while (quoteFinder.find()) {
+            pieces.add(inp.substring(unquotedStart, quoteFinder.start()));
+            pieces.add(quoteFinder.group(1));
+
+            unquotedStart = quoteFinder.end();
+        }
+
+        // Whatever follows the last quoted string (or the whole input).
+        pieces.add(inp.substring(unquotedStart));
+
+        return pieces;
+    }
+
+    /**
+     * Replace escape sequences with the characters they stand for.
+     *
+     * Handles the shortform escapes (\b \t \n \f \r \" \' \\), octal
+     * escapes and four-digit unicode escapes.
+     *
+     * @param inp The string to replace escape sequences in.
+     *
+     * @return The string with escape sequences replaced by their equivalent
+     *         characters.
+     */
+    public static String descapeString(String inp) {
+        StringBuffer work = new StringBuffer();
+
+        Matcher escapeFinder = escapePatt.matcher(inp);
+        while (escapeFinder.find()) {
+            // Group 1 is the escape body without the leading backslash, so
+            // the unicode/octal checks below see 'u0041' or '101' rather
+            // than the backslash-prefixed text of the whole match.
+            String escapeBody = escapeFinder.group(1);
+
+            String escapeRep;
+            switch (escapeBody) {
+            case "b":
+                escapeRep = "\b";
+                break;
+            case "t":
+                escapeRep = "\t";
+                break;
+            case "n":
+                escapeRep = "\n";
+                break;
+            case "f":
+                escapeRep = "\f";
+                break;
+            case "r":
+                escapeRep = "\r";
+                break;
+            case "\"":
+                escapeRep = "\"";
+                break;
+            case "'":
+                escapeRep = "'";
+                break;
+            case "\\":
+                escapeRep = "\\";
+                break;
+            default:
+                if (escapeBody.startsWith("u")) {
+                    // Drop the 'u' and decode the four hex digits.
+                    escapeRep = handleUnicodeEscape(escapeBody.substring(1));
+                } else {
+                    escapeRep = handleOctalEscape(escapeBody);
+                }
+                break;
+            }
+
+            // The replacement is literal text; quote it so that a resulting
+            // backslash or dollar sign is not interpreted by
+            // appendReplacement as an escape or group reference.
+            escapeFinder.appendReplacement(work, Matcher.quoteReplacement(escapeRep));
+        }
+        escapeFinder.appendTail(work);
+
+        return work.toString();
+    }
+
+    /*
+     * Decode the hexadecimal digits of a unicode escape into the string for
+     * that code point.
+     */
+    private static String handleUnicodeEscape(String seq) {
+        char[] units = Character.toChars(Integer.parseInt(seq, 16));
+
+        return String.valueOf(units);
+    }
+
+    /*
+     * Decode the octal digits of an octal escape into the string for that
+     * code point.
+     */
+    private static String handleOctalEscape(String seq) {
+        char[] units = Character.toChars(Integer.parseInt(seq, 8));
+
+        return String.valueOf(units);
+    }
}