summaryrefslogtreecommitdiff
path: root/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter
diff options
context:
space:
mode:
Diffstat (limited to 'BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter')
-rw-r--r-- BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/SimpleTokenSplitter.java | 235
-rw-r--r-- BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TokenSplitter.java | 26
-rw-r--r-- BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java | 110
3 files changed, 371 insertions, 0 deletions
diff --git a/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/SimpleTokenSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/SimpleTokenSplitter.java
new file mode 100644
index 0000000..8b078a9
--- /dev/null
+++ b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/SimpleTokenSplitter.java
@@ -0,0 +1,235 @@
+package bjc.utils.parserutils.splitter;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Simple implementation of {@link TokenSplitter}
+ *
+ * @author EVE
+ *
+ */
+public class SimpleTokenSplitter implements TokenSplitter {
+ /*
+ * This string is a format template for the delimiter matching regex
+ *
+ * It does two things:
+ *
+ * <ol> <li> Match to the left of the provided delimiter by positive
+ * lookahead </li> <li> Match to the right of the provided delimiter by
+ * positive lookbehind </li> </ol>
+ *
+ * Thus, it will only match in places where the delimiter is, but won't
+ * actually match the delimiter, leaving split to put it into the stream
+ */
+ private static String WITH_DELIM = "(?:(?<=%1$s)|(?=%1$s))";
+
+ /*
+ * This string is a format template for the multi-delimiter matching
+ * regex.
+ *
+ * It does the same thing as the single delimiter regex, but has to have
+ * some negative lookahead/lookbehind assertions to avoid splitting a
+ * delimiter into pieces.
+ */
+ private static String WITH_MULTI_DELIM = "(?:(?<=%1$s+)(?!%1$s)|(?<!%1$s)(?=%1$s+))";
+
+ /*
+ * These represent the internal state of the splitter.
+ */
+ private StringBuilder currPatt;
+ private StringBuilder currExclusionPatt;
+
+ /*
+ * These represent the external state of the splitter.
+ *
+ * Compilation causes internal to become external.
+ */
+ private Pattern compPatt;
+ private Pattern exclusionPatt;
+
+ /*
+ * These represent info for debugging.
+ */
+ private Set<String> delimSet;
+ private Set<String> multidelimSet;
+ private Set<String> exclusionSet;
+
+ /**
+ * Create a new token splitter.
+ */
+ public SimpleTokenSplitter() {
+ delimSet = new HashSet<>();
+ multidelimSet = new HashSet<>();
+ exclusionSet = new HashSet<>();
+ }
+
+ @Override
+ public String[] split(String inp) {
+ if(compPatt == null) throw new IllegalStateException("Token splitter has not been compiled yet");
+
+ /*
+ * Don't split something that we should exclude from being
+ * split.
+ */
+ if(exclusionPatt.matcher(inp).matches()) return new String[] { inp };
+
+ return compPatt.split(inp);
+ }
+
+ /**
+ * Adds one or more strings as matched delimiters to split on.
+ *
+ * Only works for fixed length delimiters.
+ *
+ * The provided strings are regex-escaped before being used.
+ *
+ * @param delims
+ * The delimiters to match on.
+ */
+ public void addDelimiter(String... delims) {
+ for(String delim : delims) {
+ if(delim == null) throw new NullPointerException("Delim must not be null");
+
+ String quoteDelim = Pattern.quote(delim);
+ String delimPat = String.format(WITH_DELIM, quoteDelim);
+
+ if(currPatt == null) {
+ currPatt = new StringBuilder();
+ currExclusionPatt = new StringBuilder();
+
+ currPatt.append("(?:" + delimPat + ")");
+ currExclusionPatt.append("(?:" + quoteDelim + ")");
+ } else {
+ currPatt.append("|(?:" + delimPat + ")");
+ currExclusionPatt.append("|(?:" + quoteDelim + ")");
+ }
+
+ delimSet.add(delim);
+ }
+ }
+
+ /**
+ * Adds a character class as a matched delimiter to split on.
+ *
+ * The provided string should be a pattern to match one or more
+ * occurances of.
+ *
+ * @param delims
+ * The delimiter to split on.
+ */
+ public void addMultiDelimiter(String... delims) {
+ for(String delim : delims) {
+ if(delim == null) throw new NullPointerException("Delim must not be null");
+
+ String delimPat = String.format(WITH_MULTI_DELIM, "(?:" + delim + ")");
+
+ if(currPatt == null) {
+ currPatt = new StringBuilder();
+ currExclusionPatt = new StringBuilder();
+
+ currPatt.append("(?:" + delimPat + ")");
+ currExclusionPatt.append("(?:(?:" + delim + ")+)");
+
+ } else {
+ currPatt.append("|(?:" + delimPat + ")");
+ currExclusionPatt.append("|(?:(?:" + delim + ")+)");
+ }
+
+ multidelimSet.add(delim);
+ }
+ }
+
+ /**
+ * Marks strings matching the pattern delim as non-splittable.
+ *
+ * @param delims
+ * The regex to not splitting matching strings.
+ */
+ public void addNonMatcher(String... delims) {
+ for(String delim : delims) {
+ if(delim == null) throw new NullPointerException("Delim must not be null");
+
+ if(currPatt == null) {
+ currPatt = new StringBuilder();
+ currExclusionPatt = new StringBuilder();
+
+ currExclusionPatt.append("(?:" + delim + ")");
+ } else {
+ currExclusionPatt.append("|(?:" + delim + ")");
+ }
+
+ exclusionSet.add(delim);
+ }
+ }
+
+ /**
+ * Compiles the current set of delimiters to a pattern.
+ *
+ * Makes this splitter ready to use.
+ */
+ public void compile() {
+ if(currPatt == null) currPatt = new StringBuilder();
+ if(currExclusionPatt == null) currExclusionPatt = new StringBuilder();
+
+ compPatt = Pattern.compile(currPatt.toString());
+ exclusionPatt = Pattern.compile(currExclusionPatt.toString());
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+
+ builder.append("SimpleTokenSplitter [");
+
+ if(currPatt != null) {
+ builder.append("currPatt=");
+ builder.append(currPatt);
+ builder.append("\n\t, ");
+ }
+
+ if(currExclusionPatt != null) {
+ builder.append("currExclusionPatt=");
+ builder.append(currExclusionPatt);
+ builder.append("\n\t, ");
+ }
+
+ if(compPatt != null) {
+ builder.append("compPatt=");
+ builder.append(compPatt);
+ builder.append("\n\t, ");
+ }
+
+ if(exclusionPatt != null) {
+ builder.append("exclusionPatt=");
+ builder.append(exclusionPatt);
+ builder.append("\n\t, ");
+ }
+
+ if(delimSet != null) {
+ builder.append("delimSet=");
+ builder.append(delimSet);
+ builder.append("\n\t, ");
+ }
+
+ if(multidelimSet != null) {
+ builder.append("multidelimSet=");
+ builder.append(multidelimSet);
+ builder.append("\n\t, ");
+ }
+
+ if(exclusionSet != null) {
+ builder.append("exclusionSet=");
+ builder.append(exclusionSet);
+ }
+
+ builder.append("]");
+ return builder.toString();
+ }
+}
diff --git a/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TokenSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TokenSplitter.java
new file mode 100644
index 0000000..e59d88e
--- /dev/null
+++ b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TokenSplitter.java
@@ -0,0 +1,26 @@
+package bjc.utils.parserutils.splitter;
+
+/**
+ * Split a string and keep given delimiters.
+ *
+ * @author Ben Culkin
+ */
+public interface TokenSplitter {
+ /**
+ * Split the provided string on the configured delimiters, keeping the
+ * delimiters as elements of the result.
+ *
+ * <p>
+ * The splitter must be compiled before this method is called.
+ * </p>
+ *
+ * @param inp
+ * The string to split.
+ *
+ * @return The pieces of the input string, including the delimiters.
+ *
+ * @throws IllegalStateException
+ * If the splitter has not been compiled.
+ */
+ String[] split(String inp);
+} \ No newline at end of file
diff --git a/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java
new file mode 100644
index 0000000..38f303d
--- /dev/null
+++ b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java
@@ -0,0 +1,110 @@
+package bjc.utils.parserutils.splitter;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/**
+ * Implementation of a splitter that runs in two passes.
+ *
+ * This is useful because {@link SimpleTokenSplitter} doesn't like handling both
+ * <= and = without mangling them.
+ *
+ * The first pass splits on compound operators, which are built up from simple
+ * operators.
+ *
+ * The second pass splits each resulting piece on simple operators.
+ *
+ * @author EVE
+ *
+ */
+public class TwoLevelSplitter implements TokenSplitter {
+    /* The first (compound) pass and the second (simple) pass. */
+    private final SimpleTokenSplitter high;
+    private final SimpleTokenSplitter low;
+
+    /** Create a new two level splitter with no operators registered. */
+    public TwoLevelSplitter() {
+        this.high = new SimpleTokenSplitter();
+        this.low = new SimpleTokenSplitter();
+    }
+
+    /**
+     * Split a string with the first pass, then split each resulting piece
+     * with the second pass, keeping the pieces in order.
+     *
+     * @param inp
+     *            The string to split.
+     * @return The pieces produced by the second pass.
+     */
+    @Override
+    public String[] split(String inp) {
+        List<String> tokens = new ArrayList<>();
+
+        for(String chunk : high.split(inp)) {
+            java.util.Collections.addAll(tokens, low.split(chunk));
+        }
+
+        return tokens.toArray(new String[0]);
+    }
+
+    /**
+     * Registers compound operators with the first pass.
+     *
+     * Each operator also becomes a regex-quoted non-matcher of the second pass.
+     *
+     * @param delims
+     *            The compound operators to split on.
+     */
+    public void addCompoundDelim(String... delims) {
+        for(String compound : delims) {
+            high.addDelimiter(compound);
+            low.addNonMatcher(Pattern.quote(compound));
+        }
+    }
+
+    /**
+     * Registers simple operators with the second pass.
+     *
+     * @param delims
+     *            The simple operators to split on.
+     */
+    public void addSimpleDelim(String... delims) {
+        for(String simple : delims) {
+            low.addDelimiter(simple);
+        }
+    }
+
+    /**
+     * Registers repeated compound operators with the first pass.
+     *
+     * A run of each operator also becomes a non-matcher of the second pass.
+     *
+     * @param delims
+     *            The repeated compound operators to split on.
+     */
+    public void addCompoundMulti(String... delims) {
+        for(String repeated : delims) {
+            high.addMultiDelimiter(repeated);
+            low.addNonMatcher("(?:" + repeated + ")+");
+        }
+    }
+
+    /**
+     * Registers repeated simple operators with the second pass.
+     *
+     * @param delims
+     *            The repeated simple operators to split on.
+     */
+    public void addSimpleMulti(String... delims) {
+        for(String repeated : delims) {
+            low.addMultiDelimiter(repeated);
+        }
+    }
+
+    /** Ready the splitter for use by compiling both passes. */
+    public void compile() {
+        high.compile();
+        low.compile();
+    }
+}