From e36e3b56b9b736d7a5caa889f280f90688eb9ad7 Mon Sep 17 00:00:00 2001 From: bjculkin Date: Mon, 27 Feb 2017 10:48:18 -0500 Subject: Seperate tokenizer --- dice-lang/src/bjc/dicelang/v2/DiceLangEngine.java | 142 +------------------- dice-lang/src/bjc/dicelang/v2/Tokenizer.java | 154 ++++++++++++++++++++++ 2 files changed, 160 insertions(+), 136 deletions(-) create mode 100644 dice-lang/src/bjc/dicelang/v2/Tokenizer.java diff --git a/dice-lang/src/bjc/dicelang/v2/DiceLangEngine.java b/dice-lang/src/bjc/dicelang/v2/DiceLangEngine.java index 200c0b6..8a877af 100644 --- a/dice-lang/src/bjc/dicelang/v2/DiceLangEngine.java +++ b/dice-lang/src/bjc/dicelang/v2/DiceLangEngine.java @@ -1,6 +1,5 @@ package bjc.dicelang.v2; -import bjc.dicelang.v2.dice.DiceBox; import bjc.utils.data.IPair; import bjc.utils.data.ITree; import bjc.utils.data.Pair; @@ -10,7 +9,6 @@ import bjc.utils.funcdata.FunctionalStringTokenizer; import bjc.utils.funcdata.IList; import bjc.utils.funcdata.IMap; import bjc.utils.funcutils.ListUtils; -import bjc.utils.funcutils.StringUtils; import static bjc.dicelang.v2.Errors.ErrorKey.*; import static bjc.dicelang.v2.Token.Type.*; @@ -30,7 +28,6 @@ public class DiceLangEngine { // ID for generation private int nextLiteral; - private int nextSym; // Debug indicator private boolean debugMode; @@ -43,6 +40,8 @@ public class DiceLangEngine { // Shunter for token postfixing private Shunter shunt; + // Tokenizer for tokenizing + private Tokenizer tokenzer; // Parser for tree construction private Parser parsr; // Evaluator for evaluating @@ -52,8 +51,6 @@ public class DiceLangEngine { public final IMap symTable; public final IMap stringLits; - // Literal tokens for tokenization - private IMap litTokens; // Lists for preprocessing private IList lineDefns; @@ -91,22 +88,7 @@ public class DiceLangEngine { deaffixationList.add(new Pair<>("[", "\\[")); deaffixationList.add(new Pair<>("]", "\\]")); deaffixationList.add(new Pair<>("{", "\\{")); - deaffixationList.add(new Pair<>("}", "}")); - - litTokens = new FunctionalMap<>(); - - litTokens.put("+", ADD); - litTokens.put("-", SUBTRACT); - litTokens.put("*", MULTIPLY); - litTokens.put("/", DIVIDE); - litTokens.put("//", IDIVIDE); - litTokens.put("dg", DICEGROUP); - litTokens.put("dc", DICECONCAT); - litTokens.put("dl", DICELIST); - litTokens.put("=>", LET); - litTokens.put(":=", BIND); - litTokens.put(",", GROUPSEP); - litTokens.put("crc", COERCE); + deaffixationList.add(new Pair<>("}", "}")); nextLiteral = 1; @@ -116,8 +98,8 @@ public class DiceLangEngine { stepEval = false; streamEng = new StreamEngine(this); - shunt = new Shunter(); + tokenzer = new Tokenizer(this); parsr = new Parser(); eval = new Evaluator(this); } @@ -280,7 +262,7 @@ public class DiceLangEngine { newTok = dfn.apply(newTok); } - Token tk = lexToken(token, stringLiterals); + Token tk = tokenzer.lexToken(token, stringLiterals); if(tk == null) continue; else if(tk == Token.NIL_TOKEN) return false; @@ -467,119 +449,7 @@ public class DiceLangEngine { return true; } - private Token lexToken(String token, IMap stringLts) { - if(token.equals("")) return null; - - Token tk = Token.NIL_TOKEN; - - if(litTokens.containsKey(token)) { - tk = new Token(litTokens.get(token)); - } else { - switch(token.charAt(0)) { - case '(': - case ')': - case '[': - case ']': - case '{': - case '}': - tk = tokenizeGrouping(token); - break; - default: - tk = tokenizeLiteral(token, stringLts); - } - } - - return tk; - } - - private Token tokenizeGrouping(String token) { - Token tk = Token.NIL_TOKEN; - - if(StringUtils.containsOnly(token, "\\" + token.charAt(0))) { - switch(token.charAt(0)) { - case '(': - tk = new Token(OPAREN, token.length()); - break; - case ')': - tk = new Token(CPAREN, token.length()); - break; - case '[': - tk = new Token(OBRACKET, token.length()); - break; - case ']': - tk = new Token(CBRACKET, token.length()); - break; - case '{': - tk = new Token(OBRACE, token.length()); - break; - case '}': - tk = new Token(CBRACE, token.length()); - break; - default: - Errors.inst.printError(EK_TOK_UNGROUP, token); - break; - } - } - - return tk; - } - - private Pattern intMatcher = Pattern.compile("\\A[\\-\\+]?\\d+\\Z"); - private Pattern hexadecimalMatcher = Pattern.compile("\\A[\\-\\+]?0x[0-9A-Fa-f]+\\Z"); - private Pattern flexadecimalMatcher = Pattern.compile("\\A[\\-\\+]?[0-9][0-9A-Za-z]+B\\d{1,2}\\Z"); - private Pattern stringLitMatcher = Pattern.compile("\\AstringLiteral(\\d+)\\Z"); - - private Token tokenizeLiteral(String token, IMap stringLts) { - Token tk = Token.NIL_TOKEN; - - if(intMatcher.matcher(token).matches()) { - tk = new Token(INT_LIT, Long.parseLong(token)); - } else if(hexadecimalMatcher.matcher(token).matches()) { - String newToken = token.substring(0, 1) + token.substring(token.indexOf('x')); - - tk = new Token(INT_LIT, Long.parseLong(newToken.substring(2).toUpperCase(), 16)); - } else if(flexadecimalMatcher.matcher(token).matches()) { - int parseBase = Integer.parseInt(token.substring(token.lastIndexOf('B') + 1)); - - if(parseBase < Character.MIN_RADIX || parseBase > Character.MAX_RADIX) { - Errors.inst.printError(EK_TOK_INVBASE, Integer.toString(parseBase)); - return Token.NIL_TOKEN; - } - - String flexNum = token.substring(0, token.lastIndexOf('B')); - - try { - tk = new Token(INT_LIT, Long.parseLong(flexNum, parseBase)); - } catch (NumberFormatException nfex) { - Errors.inst.printError(EK_TOK_INVFLEX, flexNum, Integer.toString(parseBase)); - return Token.NIL_TOKEN; - } - } else if(DoubleMatcher.floatingLiteral.matcher(token).matches()) { - tk = new Token(FLOAT_LIT, Double.parseDouble(token)); - } else if(DiceBox.isValidExpression(token)) { - tk = new Token(DICE_LIT, DiceBox.parseExpression(token)); - } else { - Matcher stringLit = stringLitMatcher.matcher(token); - - if(stringLit.matches()) { - int litNum = Integer.parseInt(stringLit.group(1)); - - stringLits.put(litNum, stringLts.get(token)); - tk = new Token(STRING_LIT, litNum); - } else { - // @TODO define what a valid identifier is - symTable.put(nextSym++, token); - - tk = new Token(VREF, nextSym - 1); - } - - // @TODO uncomment when we have a defn. for var names - // System.out.printf("\tERROR: Unrecognized token:" - // + "%s\n", token); - } - - return tk; - } + private IList deaffixTokens(IList tokens, List> deaffixTokens) { Deque working = new LinkedList<>(); diff --git a/dice-lang/src/bjc/dicelang/v2/Tokenizer.java b/dice-lang/src/bjc/dicelang/v2/Tokenizer.java new file mode 100644 index 0000000..40b1285 --- /dev/null +++ b/dice-lang/src/bjc/dicelang/v2/Tokenizer.java @@ -0,0 +1,154 @@ +package bjc.dicelang.v2; + +import static bjc.dicelang.v2.Errors.ErrorKey.*; +import static bjc.dicelang.v2.Token.Type.*; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import bjc.dicelang.v2.dice.DiceBox; +import bjc.utils.funcdata.FunctionalMap; +import bjc.utils.funcdata.IMap; +import bjc.utils.funcutils.StringUtils; + +public class Tokenizer { + // Literal tokens for tokenization + private IMap litTokens; + + private DiceLangEngine eng; + + private int nextSym = 0; + + public Tokenizer(DiceLangEngine engine) { + eng = engine; + + litTokens = new FunctionalMap<>(); + + litTokens.put("+", ADD); + litTokens.put("-", SUBTRACT); + litTokens.put("*", MULTIPLY); + litTokens.put("/", DIVIDE); + litTokens.put("//", IDIVIDE); + litTokens.put("dg", DICEGROUP); + litTokens.put("dc", DICECONCAT); + litTokens.put("dl", DICELIST); + litTokens.put("=>", LET); + litTokens.put(":=", BIND); + litTokens.put(",", GROUPSEP); + litTokens.put("crc", COERCE); + } + + public Token lexToken(String token, IMap stringLts) { + if(token.equals("")) return null; + + Token tk = Token.NIL_TOKEN; + + if(litTokens.containsKey(token)) { + tk = new Token(litTokens.get(token)); + } else { + switch(token.charAt(0)) { + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + tk = tokenizeGrouping(token); + break; + default: + tk = tokenizeLiteral(token, stringLts); + } + } + + return tk; + } + + private Token tokenizeGrouping(String token) { + Token tk = Token.NIL_TOKEN; + + if(StringUtils.containsOnly(token, "\\" + token.charAt(0))) { + switch(token.charAt(0)) { + case '(': + tk = new Token(OPAREN, token.length()); + break; + case ')': + tk = new Token(CPAREN, token.length()); + break; + case '[': + tk = new Token(OBRACKET, token.length()); + break; + case ']': + tk = new Token(CBRACKET, token.length()); + break; + case '{': + tk = new Token(OBRACE, token.length()); + break; + case '}': + tk = new Token(CBRACE, token.length()); + break; + default: + Errors.inst.printError(EK_TOK_UNGROUP, token); + break; + } + } + + return tk; + } + + private Pattern intMatcher = Pattern.compile("\\A[\\-\\+]?\\d+\\Z"); + private Pattern hexadecimalMatcher = Pattern.compile("\\A[\\-\\+]?0x[0-9A-Fa-f]+\\Z"); + private Pattern flexadecimalMatcher = Pattern.compile("\\A[\\-\\+]?[0-9][0-9A-Za-z]+B\\d{1,2}\\Z"); + private Pattern stringLitMatcher = Pattern.compile("\\AstringLiteral(\\d+)\\Z"); + + private Token tokenizeLiteral(String token, IMap stringLts) { + Token tk = Token.NIL_TOKEN; + + if(intMatcher.matcher(token).matches()) { + tk = new Token(INT_LIT, Long.parseLong(token)); + } else if(hexadecimalMatcher.matcher(token).matches()) { + String newToken = token.substring(0, 1) + token.substring(token.indexOf('x')); + + tk = new Token(INT_LIT, Long.parseLong(newToken.substring(2).toUpperCase(), 16)); + } else if(flexadecimalMatcher.matcher(token).matches()) { + int parseBase = Integer.parseInt(token.substring(token.lastIndexOf('B') + 1)); + + if(parseBase < Character.MIN_RADIX || parseBase > Character.MAX_RADIX) { + Errors.inst.printError(EK_TOK_INVBASE, Integer.toString(parseBase)); + return Token.NIL_TOKEN; + } + + String flexNum = token.substring(0, token.lastIndexOf('B')); + + try { + tk = new Token(INT_LIT, Long.parseLong(flexNum, parseBase)); + } catch (NumberFormatException nfex) { + Errors.inst.printError(EK_TOK_INVFLEX, flexNum, Integer.toString(parseBase)); + return Token.NIL_TOKEN; + } + } else if(DoubleMatcher.floatingLiteral.matcher(token).matches()) { + tk = new Token(FLOAT_LIT, Double.parseDouble(token)); + } else if(DiceBox.isValidExpression(token)) { + tk = new Token(DICE_LIT, DiceBox.parseExpression(token)); + } else { + Matcher stringLit = stringLitMatcher.matcher(token); + + if(stringLit.matches()) { + int litNum = Integer.parseInt(stringLit.group(1)); + + eng.stringLits.put(litNum, stringLts.get(token)); + tk = new Token(STRING_LIT, litNum); + } else { + // @TODO define what a valid identifier is + eng.symTable.put(nextSym++, token); + + tk = new Token(VREF, nextSym - 1); + } + + // @TODO uncomment when we have a defn. for var names + // System.out.printf("\tERROR: Unrecognized token:" + // + "%s\n", token); + } + + return tk; + } +} -- cgit v1.2.3