From 0f6565687e03968abd2e508fa8183f50f04f1cc7 Mon Sep 17 00:00:00 2001 From: bjculkin Date: Fri, 24 Mar 2017 16:21:07 -0400 Subject: Update Pratt Parser --- .../parserutils/splitter/SimpleTokenSplitter.java | 235 +++++++++++++++++++++ .../utils/parserutils/splitter/TokenSplitter.java | 26 +++ .../parserutils/splitter/TwoLevelSplitter.java | 110 ++++++++++ 3 files changed, 371 insertions(+) create mode 100644 BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/SimpleTokenSplitter.java create mode 100644 BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TokenSplitter.java create mode 100644 BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java (limited to 'BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter') diff --git a/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/SimpleTokenSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/SimpleTokenSplitter.java new file mode 100644 index 0000000..8b078a9 --- /dev/null +++ b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/SimpleTokenSplitter.java @@ -0,0 +1,235 @@ +package bjc.utils.parserutils.splitter; + +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Pattern; + +/** + * Simple implementation of {@link TokenSplitter} + * + * @author EVE + * + */ +public class SimpleTokenSplitter implements TokenSplitter { + /* + * This string is a format template for the delimiter matching regex + * + * It does two things: + * + *
  1. Match to the left of the provided delimiter by positive + * lookahead
  2. Match to the right of the provided delimiter by + * positive lookbehind
+ * + * Thus, it will only match in places where the delimiter is, but won't + * actually match the delimiter, leaving split to put it into the stream + */ + private static String WITH_DELIM = "(?:(?<=%1$s)|(?=%1$s))"; + + /* + * This string is a format template for the multi-delimiter matching + * regex. + * + * It does the same thing as the single delimiter regex, but has to have + * some negative lookahead/lookbehind assertions to avoid splitting a + * delimiter into pieces. + */ + private static String WITH_MULTI_DELIM = "(?:(?<=%1$s+)(?!%1$s)|(? delimSet; + private Set multidelimSet; + private Set exclusionSet; + + /** + * Create a new token splitter. + */ + public SimpleTokenSplitter() { + delimSet = new HashSet<>(); + multidelimSet = new HashSet<>(); + exclusionSet = new HashSet<>(); + } + + @Override + public String[] split(String inp) { + if(compPatt == null) throw new IllegalStateException("Token splitter has not been compiled yet"); + + /* + * Don't split something that we should exclude from being + * split. + */ + if(exclusionPatt.matcher(inp).matches()) return new String[] { inp }; + + return compPatt.split(inp); + } + + /** + * Adds one or more strings as matched delimiters to split on. + * + * Only works for fixed length delimiters. + * + * The provided strings are regex-escaped before being used. + * + * @param delims + * The delimiters to match on. + */ + public void addDelimiter(String... delims) { + for(String delim : delims) { + if(delim == null) throw new NullPointerException("Delim must not be null"); + + String quoteDelim = Pattern.quote(delim); + String delimPat = String.format(WITH_DELIM, quoteDelim); + + if(currPatt == null) { + currPatt = new StringBuilder(); + currExclusionPatt = new StringBuilder(); + + currPatt.append("(?:" + delimPat + ")"); + currExclusionPatt.append("(?:" + quoteDelim + ")"); + } else { + currPatt.append("|(?:" + delimPat + ")"); + currExclusionPatt.append("|(?:" + quoteDelim + ")"); + } + + delimSet.add(delim); + } + } + + /** + * Adds a character class as a matched delimiter to split on. + * + * The provided string should be a pattern to match one or more + * occurances of. + * + * @param delims + * The delimiter to split on. + */ + public void addMultiDelimiter(String... delims) { + for(String delim : delims) { + if(delim == null) throw new NullPointerException("Delim must not be null"); + + String delimPat = String.format(WITH_MULTI_DELIM, "(?:" + delim + ")"); + + if(currPatt == null) { + currPatt = new StringBuilder(); + currExclusionPatt = new StringBuilder(); + + currPatt.append("(?:" + delimPat + ")"); + currExclusionPatt.append("(?:(?:" + delim + ")+)"); + + } else { + currPatt.append("|(?:" + delimPat + ")"); + currExclusionPatt.append("|(?:(?:" + delim + ")+)"); + } + + multidelimSet.add(delim); + } + } + + /** + * Marks strings matching the pattern delim as non-splittable. + * + * @param delims + * The regex to not splitting matching strings. + */ + public void addNonMatcher(String... delims) { + for(String delim : delims) { + if(delim == null) throw new NullPointerException("Delim must not be null"); + + if(currPatt == null) { + currPatt = new StringBuilder(); + currExclusionPatt = new StringBuilder(); + + currExclusionPatt.append("(?:" + delim + ")"); + } else { + currExclusionPatt.append("|(?:" + delim + ")"); + } + + exclusionSet.add(delim); + } + } + + /** + * Compiles the current set of delimiters to a pattern. + * + * Makes this splitter ready to use. + */ + public void compile() { + if(currPatt == null) currPatt = new StringBuilder(); + if(currExclusionPatt == null) currExclusionPatt = new StringBuilder(); + + compPatt = Pattern.compile(currPatt.toString()); + exclusionPatt = Pattern.compile(currExclusionPatt.toString()); + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + + builder.append("SimpleTokenSplitter ["); + + if(currPatt != null) { + builder.append("currPatt="); + builder.append(currPatt); + builder.append("\n\t, "); + } + + if(currExclusionPatt != null) { + builder.append("currExclusionPatt="); + builder.append(currExclusionPatt); + builder.append("\n\t, "); + } + + if(compPatt != null) { + builder.append("compPatt="); + builder.append(compPatt); + builder.append("\n\t, "); + } + + if(exclusionPatt != null) { + builder.append("exclusionPatt="); + builder.append(exclusionPatt); + builder.append("\n\t, "); + } + + if(delimSet != null) { + builder.append("delimSet="); + builder.append(delimSet); + builder.append("\n\t, "); + } + + if(multidelimSet != null) { + builder.append("multidelimSet="); + builder.append(multidelimSet); + builder.append("\n\t, "); + } + + if(exclusionSet != null) { + builder.append("exclusionSet="); + builder.append(exclusionSet); + } + + builder.append("]"); + return builder.toString(); + } +} diff --git a/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TokenSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TokenSplitter.java new file mode 100644 index 0000000..e59d88e --- /dev/null +++ b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TokenSplitter.java @@ -0,0 +1,26 @@ +package bjc.utils.parserutils.splitter; + +/** + * Split a string and keep given delimiters. + * + * @author Ben Culkin + */ +public interface TokenSplitter { + /** + * Split a provided string using configured delimiters, and keeping the + * delimiters. + * + *

+ * The splitter must be compiled first. + *

+ * + * @param inp + * The string to split. + * + * @return The split string, including delimiters. + * + * @throws IllegalStateException + * If the splitter isn't compiled. + */ + String[] split(String inp); +} \ No newline at end of file diff --git a/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java new file mode 100644 index 0000000..38f303d --- /dev/null +++ b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java @@ -0,0 +1,110 @@ +package bjc.utils.parserutils.splitter; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +/** + * Implementation of a splitter that runs in two passes. + * + * This is useful because {@link SimpleTokenSplitter} doesn't like handling both + * <= and = without mangling them. + * + * The first pass splits on compound operators, which are built up from simple + * operators. + * + * The second pass removes simple operators. + * + * @author EVE + * + */ +public class TwoLevelSplitter implements TokenSplitter { + private SimpleTokenSplitter high; + private SimpleTokenSplitter low; + + /** + * Create a new two level splitter. + */ + public TwoLevelSplitter() { + high = new SimpleTokenSplitter(); + low = new SimpleTokenSplitter(); + } + + @Override + public String[] split(String inp) { + List ret = new ArrayList<>(); + + String[] partials = high.split(inp); + + for(String partial : partials) { + String[] finals = low.split(partial); + + for(String fin : finals) { + ret.add(fin); + } + } + + return ret.toArray(new String[ret.size()]); + } + + /** + * Adds compound operators to split on. + * + * @param delims + * The compound operators to split on. + */ + public void addCompoundDelim(String... delims) { + for(String delim : delims) { + high.addDelimiter(delim); + + low.addNonMatcher(Pattern.quote(delim)); + } + } + + /** + * Adds simple operators to split on. + * + * @param delims + * The simple operators to split on. + */ + public void addSimpleDelim(String... delims) { + for(String delim : delims) { + low.addDelimiter(delim); + } + } + + /** + * Adds repeated compound operators to split on. + * + * @param delims + * The repeated compound operators to split on. + */ + public void addCompoundMulti(String... delims) { + for(String delim : delims) { + high.addMultiDelimiter(delim); + + low.addNonMatcher("(?:" + delim + ")+"); + } + } + + /** + * Adds simple compound operators to split on. + * + * @param delims + * The repeated simple operators to split on. + */ + public void addSimpleMulti(String... delims) { + for(String delim : delims) { + low.addMultiDelimiter(delim); + } + } + + /** + * Ready the splitter for use. + */ + public void compile() { + high.compile(); + + low.compile(); + } +} -- cgit v1.2.3