diff options
| author | Benjamin J. Culkin <bjculkin@mix.wvu.edu> | 2017-10-25 12:10:14 -0300 |
|---|---|---|
| committer | Benjamin J. Culkin <bjculkin@mix.wvu.edu> | 2017-10-25 12:10:14 -0300 |
| commit | 7bda9de511a5642efb297eae98c6ea7c42b27754 (patch) | |
| tree | dff1aa772b9ac088c5bd07b8d10d944cbff89f96 /base/src/bjc/dicelang/Tokenizer.java | |
| parent | f028ea6dc555fc5192a96b00b8e96e90dbf6de55 (diff) | |
Start switch to maven modules
Diffstat (limited to 'base/src/bjc/dicelang/Tokenizer.java')
| -rw-r--r-- | base/src/bjc/dicelang/Tokenizer.java | 174 |
1 files changed, 174 insertions, 0 deletions
package bjc.dicelang;

import static bjc.dicelang.Errors.ErrorKey.EK_TOK_INVBASE;
import static bjc.dicelang.Errors.ErrorKey.EK_TOK_INVFLEX;
import static bjc.dicelang.Errors.ErrorKey.EK_TOK_UNGROUP;
import static bjc.dicelang.Token.Type.*;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import bjc.dicelang.dice.DiceBox;
import bjc.utils.funcdata.FunctionalMap;
import bjc.utils.funcdata.IMap;
import bjc.utils.funcutils.StringUtils;
import bjc.utils.parserutils.TokenUtils;

/**
 * Converts strings into tokens.
 *
 * <p>
 * A token string is classified, in order, as: a fixed literal token
 * (operators and keywords registered in the constructor), a grouping
 * delimiter, or a literal value / symbol reference.
 */
public class Tokenizer {
    /* Patterns for matching. Compiled once, since they never change. */

    /* Optionally signed hexadecimal integer, e.g. 0x1F or -0xff. */
    private static final Pattern HEXADECIMAL_MATCHER =
            Pattern.compile("\\A[\\-\\+]?0x[0-9A-Fa-f]+\\Z");

    /*
     * Optionally signed integer in an explicit base: digits, then 'B',
     * then a one or two digit base, e.g. 1010B2.
     */
    private static final Pattern FLEXADECIMAL_MATCHER =
            Pattern.compile("\\A[\\-\\+]?[0-9][0-9A-Za-z]+B\\d{1,2}\\Z");

    /* Placeholder naming a previously extracted string literal. */
    private static final Pattern STRING_LIT_MATCHER =
            Pattern.compile("\\AstringLiteral(\\d+)\\Z");

    /* Literal tokens for tokenization */
    private final IMap<String, Token.Type> litTokens;

    /* Engine that receives string literals and symbol table entries. */
    private final DiceLangEngine eng;

    /* Index that the next symbol will be stored under in the symbol table. */
    private int nextSym = 0;

    /**
     * Create a new tokenizer.
     *
     * @param engine
     *        The engine whose string literals and symbol table are
     *        populated while tokenizing.
     */
    public Tokenizer(final DiceLangEngine engine) {
        eng = engine;

        litTokens = new FunctionalMap<>();

        litTokens.put("+", ADD);
        litTokens.put("-", SUBTRACT);
        litTokens.put("*", MULTIPLY);
        litTokens.put("/", DIVIDE);
        litTokens.put("//", IDIVIDE);
        litTokens.put("sd", DICESCALAR);
        litTokens.put("df", DICEFUDGE);
        litTokens.put("dg", DICEGROUP);
        litTokens.put("dc", DICECONCAT);
        litTokens.put("dl", DICELIST);
        litTokens.put("=>", LET);
        litTokens.put(":=", BIND);
        litTokens.put(".+.", STRCAT);
        litTokens.put(".*.", STRREP);
        litTokens.put(",", GROUPSEP);
        litTokens.put("crc", COERCE);
    }

    /**
     * Convert a single token string into a token.
     *
     * @param token
     *        The string to convert.
     * @param stringLts
     *        The string literals extracted so far, keyed by their
     *        placeholder name.
     *
     * @return {@code null} if the string is empty; otherwise the lexed
     *         token, which is {@link Token#NIL_TOKEN} when the string
     *         could not be tokenized.
     */
    public Token lexToken(final String token, final IMap<String, String> stringLts) {
        if (token.equals("")) {
            return null;
        }

        if (litTokens.containsKey(token)) {
            return new Token(litTokens.get(token));
        }

        switch (token.charAt(0)) {
        case '(':
        case ')':
        case '[':
        case ']':
        case '{':
        case '}':
            return tokenizeGrouping(token);
        default:
            return tokenizeLiteral(token, stringLts);
        }
    }

    /*
     * Tokenize a run of one grouping delimiter.
     *
     * The token's length is attached to the delimiter token. A token that
     * does not pass the containsOnly check against its first character
     * yields NIL_TOKEN without reporting an error.
     */
    private static Token tokenizeGrouping(final String token) {
        Token tk = Token.NIL_TOKEN;

        if (StringUtils.containsOnly(token, "\\" + token.charAt(0))) {
            /* Handle multiple-grouped delimiters. */
            switch (token.charAt(0)) {
            case '(':
                tk = new Token(OPAREN, token.length());
                break;

            case ')':
                tk = new Token(CPAREN, token.length());
                break;

            case '[':
                tk = new Token(OBRACKET, token.length());
                break;

            case ']':
                tk = new Token(CBRACKET, token.length());
                break;

            case '{':
                tk = new Token(OBRACE, token.length());
                break;

            case '}':
                tk = new Token(CBRACE, token.length());
                break;

            default:
                /* Not reachable from lexToken; kept as a safety net. */
                Errors.inst.printError(EK_TOK_UNGROUP, token);
                break;
            }
        }

        return tk;
    }

    /*
     * Tokenize a literal value.
     *
     * The trimmed token is tried, in order, as: an integer, a hexadecimal
     * integer, an integer in an explicit base, a floating-point number, a
     * dice expression and a string literal placeholder. Anything else is
     * stored in the engine's symbol table and becomes a variable
     * reference.
     */
    private Token tokenizeLiteral(final String rtoken, final IMap<String, String> stringLts) {
        final String token = rtoken.trim();

        if (TokenUtils.isInt(token)) {
            return new Token(INT_LIT, Long.parseLong(token));
        }

        if (HEXADECIMAL_MATCHER.matcher(token).matches()) {
            /*
             * Keep a leading '-' so negative hexadecimal literals stay
             * negative; a leading '+' carries no information.
             */
            final String sign = token.startsWith("-") ? "-" : "";
            final String digits = token.substring(token.indexOf('x') + 1);

            return new Token(INT_LIT, Long.parseLong(sign + digits, 16));
        }

        if (FLEXADECIMAL_MATCHER.matcher(token).matches()) {
            final int baseMarker = token.lastIndexOf('B');
            final int parseBase = Integer.parseInt(token.substring(baseMarker + 1));

            if (parseBase < Character.MIN_RADIX || parseBase > Character.MAX_RADIX) {
                Errors.inst.printError(EK_TOK_INVBASE, Integer.toString(parseBase));
                return Token.NIL_TOKEN;
            }

            /* Includes the optional sign, which parseLong understands. */
            final String flexNum = token.substring(0, baseMarker);

            try {
                return new Token(INT_LIT, Long.parseLong(flexNum, parseBase));
            } catch (final NumberFormatException nfex) {
                /* The digits are not valid in the requested base. */
                Errors.inst.printError(EK_TOK_INVFLEX, flexNum, Integer.toString(parseBase));
                return Token.NIL_TOKEN;
            }
        }

        if (TokenUtils.isDouble(token)) {
            return new Token(FLOAT_LIT, Double.parseDouble(token));
        }

        if (DiceBox.isValidExpression(token)) {
            return new Token(DICE_LIT, DiceBox.parseExpression(token));
        }

        final Matcher stringLit = STRING_LIT_MATCHER.matcher(token);

        if (stringLit.matches()) {
            final int litNum = Integer.parseInt(stringLit.group(1));

            /* Register the literal's text with the engine under its number. */
            eng.addStringLiteral(litNum, stringLts.get(token));
            return new Token(STRING_LIT, litNum);
        }

        /* Everything else is a symbol; each occurrence gets a fresh index. */
        final int symIndex = nextSym++;
        eng.symTable.put(symIndex, token);

        return new Token(VREF, symIndex);
    }
}
