diff options
| author | bjculkin <bjculkin@mix.wvu.edu> | 2017-03-17 08:34:57 -0400 |
|---|---|---|
| committer | bjculkin <bjculkin@mix.wvu.edu> | 2017-03-17 08:34:57 -0400 |
| commit | 9d89261fedf23c11b684eb66cefdd86a9378ad20 (patch) | |
| tree | 5158fdaedcd2951fbd41d49b72f7e09200fa3192 /BJC-Utils2/src/main/java/bjc/utils/parserutils/TokenSplitter.java | |
| parent | a63c30f5fe9ee302e73bb30e35095d789adb1a80 (diff) | |
Move parsing utilities.
Moved the parsing utilities SequenceDelimiter and TokenSplitter to the
parserutils package, instead of the funcutils package.
Diffstat (limited to 'BJC-Utils2/src/main/java/bjc/utils/parserutils/TokenSplitter.java')
| -rw-r--r-- | BJC-Utils2/src/main/java/bjc/utils/parserutils/TokenSplitter.java | 161 |
1 files changed, 161 insertions, 0 deletions
import java.util.regex.Pattern;

/**
 * Split a string and keep given delimiters.
 *
 * <p>
 * Typical use: register delimiters with {@link #addDelimiter(String...)} and/or
 * {@link #addMultiDelimiter(String)}, optionally register patterns that must
 * never be split with {@link #addNonMatcher(String)}, call {@link #compile()},
 * and then call {@link #split(String)} as often as needed. Delimiters may be
 * added after compiling, but only take effect after the next
 * {@link #compile()}.
 * </p>
 *
 * @author Ben Culkin
 */
public class TokenSplitter {
	/*
	 * This string is a format template for the delimiter matching regex.
	 *
	 * It does two things:
	 *
	 * 1. Match just after the provided delimiter by positive lookbehind.
	 * 2. Match just before the provided delimiter by positive lookahead.
	 *
	 * Thus, it will only match in places where the delimiter is, but won't
	 * actually consume the delimiter, leaving split to put it into the
	 * result as its own token.
	 */
	private static final String WITH_DELIM = "(?:(?<=%1$s)|(?=%1$s))";

	/*
	 * This string is a format template for the multi-delimiter matching
	 * regex.
	 *
	 * It does the same thing as the single delimiter regex, but adds
	 * negative lookahead/lookbehind assertions so a run of delimiters is
	 * not split into pieces.
	 *
	 * No '+' quantifier is used inside the look-arounds: "a run of one or
	 * more delimiters ends/starts here" holds exactly when "a delimiter
	 * ends/starts here" holds, so the quantifier is redundant, and keeping
	 * the look-behind bounded avoids relying on unbounded look-behind
	 * support in the regex engine.
	 */
	private static final String WITH_MULTI_DELIM = "(?:(?<=%1$s)(?!%1$s)|(?<!%1$s)(?=%1$s))";

	/*
	 * A regex that can never match. Used when one side of the splitter
	 * (delimiters or exclusions) has nothing registered, so that an empty
	 * regex is never compiled (an empty regex matches at every position).
	 */
	private static final String NEVER_MATCH = "(?!)";

	/*
	 * These represent the internal state of the splitter.
	 *
	 * Each builder holds an alternation of the regexes registered so far
	 * and is empty while nothing has been registered for it.
	 */
	private final StringBuilder currPatt = new StringBuilder();
	private final StringBuilder currExclusionPatt = new StringBuilder();

	/*
	 * These represent the external state of the splitter.
	 *
	 * Compilation causes internal to become external.
	 */
	private Pattern compPatt;
	private Pattern exclusionPatt;

	/**
	 * Create a new token splitter.
	 */
	public TokenSplitter() {
	}

	/**
	 * Split a provided string using configured delimiters, and keeping the
	 * delimiters.
	 *
	 * <p>
	 * The splitter must be compiled first. A string that matches one of the
	 * exclusion patterns in its entirety (this includes a string that is
	 * exactly one delimiter) is returned unsplit.
	 * </p>
	 *
	 * @param inp
	 *                The string to split.
	 *
	 * @return The split string, including delimiters.
	 *
	 * @throws IllegalStateException
	 *                 If the splitter isn't compiled.
	 */
	public String[] split(String inp) {
		if (compPatt == null) {
			throw new IllegalStateException("Token splitter has not been compiled yet");
		}

		/*
		 * Don't split something that we should exclude from being split.
		 */
		if (exclusionPatt.matcher(inp).matches()) {
			return new String[] { inp };
		}

		return compPatt.split(inp);
	}

	/**
	 * Adds one or more strings as matched delimiters to split on.
	 *
	 * Only works for fixed length delimiters.
	 *
	 * The provided strings are regex-escaped before being used.
	 *
	 * @param delims
	 *                The delimiters to match on.
	 */
	public void addDelimiter(String... delims) {
		for (String delim : delims) {
			String quoteDelim = Pattern.quote(delim);

			appendAlternative(currPatt, String.format(WITH_DELIM, quoteDelim));
			/* A lone delimiter is a complete token and must stay whole. */
			appendAlternative(currExclusionPatt, quoteDelim);
		}
	}

	/**
	 * Adds a character class as a matched delimiter to split on.
	 *
	 * The provided string should be a pattern to match one or more
	 * occurrences of. It is used as a regex without escaping.
	 *
	 * @param delim
	 *                The delimiter to split on.
	 */
	public void addMultiDelimiter(String delim) {
		String groupedDelim = "(?:" + delim + ")";

		appendAlternative(currPatt, String.format(WITH_MULTI_DELIM, groupedDelim));
		/* A string made only of repeated delimiters must stay whole. */
		appendAlternative(currExclusionPatt, groupedDelim + "+");
	}

	/**
	 * Marks strings matching the pattern delim as non-splittable.
	 *
	 * <p>
	 * This only affects the exclusion pattern; it never contributes to the
	 * delimiter pattern, regardless of the order in which the configuration
	 * methods are called.
	 * </p>
	 *
	 * @param delim
	 *                The regex describing strings that should not be split.
	 */
	public void addNonMatcher(String delim) {
		appendAlternative(currExclusionPatt, delim);
	}

	/**
	 * Compiles the current set of delimiters to a pattern.
	 *
	 * Makes this splitter ready to use. If no delimiters have been added,
	 * the compiled splitter returns every input unsplit; if no exclusions
	 * have been added, no input is excluded.
	 */
	public void compile() {
		compPatt = Pattern.compile(regexOrNever(currPatt));
		exclusionPatt = Pattern.compile(regexOrNever(currExclusionPatt));
	}

	/*
	 * Append regex to the alternation in target as a non-capturing group,
	 * inserting the '|' separator only when target already has content.
	 */
	private static void appendAlternative(StringBuilder target, String regex) {
		if (target.length() > 0) {
			target.append('|');
		}

		target.append("(?:").append(regex).append(')');
	}

	/*
	 * Get the regex held by source, or a never-matching regex if source is
	 * empty.
	 */
	private static String regexOrNever(StringBuilder source) {
		return source.length() == 0 ? NEVER_MATCH : source.toString();
	}
}
