diff options
Diffstat (limited to 'BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter')
3 files changed, 371 insertions, 0 deletions
// ---------------------------------------------------------------------------
// File: BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/SimpleTokenSplitter.java
// Package: bjc.utils.parserutils.splitter
// ---------------------------------------------------------------------------

import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Simple implementation of {@link TokenSplitter}.
 *
 * <p>
 * Delimiters and exclusions are accumulated as regex alternatives by the
 * {@code add*} methods; {@link #compile()} turns the accumulated text into the
 * patterns that {@link #split(String)} actually uses. Changes made after a
 * call to {@link #compile()} have no effect until it is called again.
 * </p>
 *
 * @author EVE
 */
public class SimpleTokenSplitter implements TokenSplitter {
    /*
     * Format template for the single-delimiter matching regex.
     *
     * It matches the empty string directly after the delimiter (positive
     * lookbehind) or directly before it (positive lookahead). The delimiter
     * itself is never consumed, so Pattern.split leaves it in the output as
     * its own token.
     */
    private static final String WITH_DELIM = "(?:(?<=%1$s)|(?=%1$s))";

    /*
     * Format template for the multi-delimiter matching regex.
     *
     * Same idea as WITH_DELIM, but with additional negative
     * lookahead/lookbehind assertions so that a run of the delimiter is not
     * split into pieces.
     */
    private static final String WITH_MULTI_DELIM = "(?:(?<=%1$s+)(?!%1$s)|(?<!%1$s)(?=%1$s+))";

    /*
     * An empty negative lookahead: a regex that can never match. Compiled in
     * place of an empty alternative list, because an empty regex matches at
     * every position and would split the input into single characters.
     */
    private static final String NEVER_MATCH = "(?!)";

    /*
     * Internal (uncompiled) state of the splitter. Each builder stays null
     * until something is added to it or compile() is called.
     */
    private StringBuilder currPatt;
    private StringBuilder currExclusionPatt;

    /*
     * External (compiled) state of the splitter.
     *
     * compile() turns the internal state into the external state.
     */
    private Pattern compPatt;
    private Pattern exclusionPatt;

    /*
     * Raw strings handed to the add* methods; only used by toString().
     */
    private final Set<String> delimSet = new HashSet<>();
    private final Set<String> multidelimSet = new HashSet<>();
    private final Set<String> exclusionSet = new HashSet<>();

    /**
     * Create a new token splitter.
     */
    public SimpleTokenSplitter() {
        // All state is initialised at the field declarations.
    }

    /**
     * Split a string on the compiled delimiters, keeping the delimiters as
     * tokens of their own.
     *
     * @param inp
     *            The string to split.
     *
     * @return The pieces of the string. If the whole input matches the
     *         compiled exclusion pattern it is returned as a single element.
     *
     * @throws IllegalStateException
     *             If {@link #compile()} has not been called yet.
     */
    @Override
    public String[] split(String inp) {
        if (compPatt == null) {
            throw new IllegalStateException("Token splitter has not been compiled yet");
        }

        /*
         * Don't split something that we should exclude from being split.
         */
        if (exclusionPatt.matcher(inp).matches()) {
            return new String[] { inp };
        }

        return compPatt.split(inp);
    }

    /**
     * Adds one or more strings as matched delimiters to split on.
     *
     * Only works for fixed length delimiters.
     *
     * The provided strings are regex-escaped before being used. Each
     * delimiter is also added to the exclusion pattern, so an input that
     * consists of exactly the delimiter is returned unsplit.
     *
     * @param delims
     *            The delimiters to match on.
     *
     * @throws NullPointerException
     *             If any delimiter is null. Delimiters preceding the null one
     *             have already been added at that point.
     */
    public void addDelimiter(String... delims) {
        for (String delim : delims) {
            Objects.requireNonNull(delim, "Delim must not be null");

            String quoteDelim = Pattern.quote(delim);
            String delimPat = String.format(WITH_DELIM, quoteDelim);

            currPatt = appendAlternative(currPatt, delimPat);
            currExclusionPatt = appendAlternative(currExclusionPatt, quoteDelim);

            delimSet.add(delim);
        }
    }

    /**
     * Adds a character class as a matched delimiter to split on.
     *
     * The provided string should be a pattern to match one or more
     * occurrences of. It is inserted into the regex as-is, without escaping.
     *
     * @param delims
     *            The delimiter patterns to split on.
     *
     * @throws NullPointerException
     *             If any delimiter is null. Delimiters preceding the null one
     *             have already been added at that point.
     */
    public void addMultiDelimiter(String... delims) {
        for (String delim : delims) {
            Objects.requireNonNull(delim, "Delim must not be null");

            String delimPat = String.format(WITH_MULTI_DELIM, "(?:" + delim + ")");

            currPatt = appendAlternative(currPatt, delimPat);
            currExclusionPatt = appendAlternative(currExclusionPatt, "(?:" + delim + ")+");

            multidelimSet.add(delim);
        }
    }

    /**
     * Marks strings matching the given patterns as non-splittable.
     *
     * The patterns are used as-is (no escaping) and have to match the entire
     * input handed to {@link #split(String)} for it to be left unsplit.
     *
     * @param delims
     *            The regexes describing strings that must not be split.
     *
     * @throws NullPointerException
     *             If any pattern is null. Patterns preceding the null one have
     *             already been added at that point.
     */
    public void addNonMatcher(String... delims) {
        for (String delim : delims) {
            Objects.requireNonNull(delim, "Delim must not be null");

            /*
             * Only the exclusion pattern is touched. Creating an empty
             * delimiter pattern here would make the next delimiter be
             * appended as "|(?:...)", whose empty first alternative matches
             * at every position.
             */
            currExclusionPatt = appendAlternative(currExclusionPatt, delim);

            exclusionSet.add(delim);
        }
    }

    /**
     * Compiles the current set of delimiters to a pattern.
     *
     * Makes this splitter ready to use. A splitter without delimiters never
     * splits; a splitter without exclusions never excludes.
     */
    public void compile() {
        if (currPatt == null) {
            currPatt = new StringBuilder();
        }
        if (currExclusionPatt == null) {
            currExclusionPatt = new StringBuilder();
        }

        compPatt = Pattern.compile(regexOrNever(currPatt));
        exclusionPatt = Pattern.compile(regexOrNever(currExclusionPatt));
    }

    /*
     * (non-Javadoc)
     *
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder("SimpleTokenSplitter [");

        appendField(builder, "currPatt=", currPatt);
        appendField(builder, "currExclusionPatt=", currExclusionPatt);
        appendField(builder, "compPatt=", compPatt);
        appendField(builder, "exclusionPatt=", exclusionPatt);
        appendField(builder, "delimSet=", delimSet);
        appendField(builder, "multidelimSet=", multidelimSet);

        // Last field: no trailing separator.
        builder.append("exclusionSet=").append(exclusionSet);

        return builder.append("]").toString();
    }

    /*
     * Appends alternative, wrapped in a non-capturing group, to target,
     * creating the builder on first use and inserting '|' only between
     * alternatives.
     */
    private static StringBuilder appendAlternative(StringBuilder target, String alternative) {
        StringBuilder result = (target == null) ? new StringBuilder() : target;

        if (result.length() > 0) {
            result.append('|');
        }

        return result.append("(?:").append(alternative).append(')');
    }

    /*
     * Returns the accumulated regex, or NEVER_MATCH if nothing was added.
     */
    private static String regexOrNever(StringBuilder source) {
        return source.length() == 0 ? NEVER_MATCH : source.toString();
    }

    /*
     * Appends "label + value + separator" to builder, skipping null values.
     */
    private static void appendField(StringBuilder builder, String label, Object value) {
        if (value != null) {
            builder.append(label).append(value).append("\n\t, ");
        }
    }
}

// ---------------------------------------------------------------------------
// File: BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TokenSplitter.java
// Package: bjc.utils.parserutils.splitter
//
// Declared as "public interface TokenSplitter" in its own file; it is listed
// without the modifier here only because a single Java compilation unit may
// contain one public top-level type.
// ---------------------------------------------------------------------------

/**
 * Split a string and keep given delimiters.
 *
 * @author Ben Culkin
 */
interface TokenSplitter {
    /**
     * Split a provided string using configured delimiters, and keeping the
     * delimiters.
     *
     * <p>
     * The splitter must be compiled first.
     * </p>
     *
     * @param inp
     *            The string to split.
     *
     * @return The split string, including delimiters.
     *
     * @throws IllegalStateException
     *             If the splitter isn't compiled.
     */
    String[] split(String inp);
}
\ No newline at end of file diff --git a/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java new file mode 100644 index 0000000..38f303d --- /dev/null +++ b/BJC-Utils2/src/main/java/bjc/utils/parserutils/splitter/TwoLevelSplitter.java @@ -0,0 +1,110 @@ +package bjc.utils.parserutils.splitter; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +/** + * Implementation of a splitter that runs in two passes. + * + * This is useful because {@link SimpleTokenSplitter} doesn't like handling both + * <= and = without mangling them. + * + * The first pass splits on compound operators, which are built up from simple + * operators. + * + * The second pass removes simple operators. + * + * @author EVE + * + */ +public class TwoLevelSplitter implements TokenSplitter { + private SimpleTokenSplitter high; + private SimpleTokenSplitter low; + + /** + * Create a new two level splitter. + */ + public TwoLevelSplitter() { + high = new SimpleTokenSplitter(); + low = new SimpleTokenSplitter(); + } + + @Override + public String[] split(String inp) { + List<String> ret = new ArrayList<>(); + + String[] partials = high.split(inp); + + for(String partial : partials) { + String[] finals = low.split(partial); + + for(String fin : finals) { + ret.add(fin); + } + } + + return ret.toArray(new String[ret.size()]); + } + + /** + * Adds compound operators to split on. + * + * @param delims + * The compound operators to split on. + */ + public void addCompoundDelim(String... delims) { + for(String delim : delims) { + high.addDelimiter(delim); + + low.addNonMatcher(Pattern.quote(delim)); + } + } + + /** + * Adds simple operators to split on. + * + * @param delims + * The simple operators to split on. + */ + public void addSimpleDelim(String... 
delims) { + for(String delim : delims) { + low.addDelimiter(delim); + } + } + + /** + * Adds repeated compound operators to split on. + * + * @param delims + * The repeated compound operators to split on. + */ + public void addCompoundMulti(String... delims) { + for(String delim : delims) { + high.addMultiDelimiter(delim); + + low.addNonMatcher("(?:" + delim + ")+"); + } + } + + /** + * Adds simple compound operators to split on. + * + * @param delims + * The repeated simple operators to split on. + */ + public void addSimpleMulti(String... delims) { + for(String delim : delims) { + low.addMultiDelimiter(delim); + } + } + + /** + * Ready the splitter for use. + */ + public void compile() { + high.compile(); + + low.compile(); + } +} |
