163
+ − 1 using System.Linq;
+ − 2 using Implab.Automaton.RegularExpressions;
165
+ − 3 using System;
172
+ − 4 using Implab.Automaton;
180
+ − 5 using Implab.Components;
163
+ − 6
228
namespace Implab.Formats.Json {
    /// <summary>
    /// Lexical grammar for JSON text. The constructor builds two scanners: one for
    /// the tokens that appear outside of string literals and one for the contents
    /// of a string literal. Copies of these scanners are handed out by
    /// <see cref="CreateJsonExpressionScanner"/> and
    /// <see cref="CreateStringExpressionScanner"/>.
    /// </summary>
    public class JsonGrammar : Grammar<char> {
        /// <summary>
        /// Tags attached to the tokens recognized by the scanners of this grammar.
        /// </summary>
        public enum TokenType {
            /// <summary>Default value; assigned to scanner states which carry no tag.</summary>
            None,
            /// <summary>The '{' character together with the whitespace around it.</summary>
            BeginObject,
            /// <summary>The '}' character together with the whitespace around it.</summary>
            EndObject,
            /// <summary>The '[' character together with the whitespace around it.</summary>
            BeginArray,
            /// <summary>The ']' character together with the whitespace around it.</summary>
            EndArray,
            /// <summary>Not assigned by any expression defined in this class.</summary>
            String,
            /// <summary>A numeric literal: optional minus, integer part, optional fraction and exponent.</summary>
            Number,
            /// <summary>A run of lowercase latin letters ('a'..'z').</summary>
            Literal,
            /// <summary>The ':' character together with the whitespace around it.</summary>
            NameSeparator,
            /// <summary>The ',' character together with the whitespace around it.</summary>
            ValueSeparator,
            /// <summary>A run of '\n', '\r', '\t' or ' ' characters.</summary>
            Whitespace,

            /// <summary>The '"' character which opens or closes a string literal.</summary>
            StringBound,
            /// <summary>A backslash followed by one of: \ " / b f t n r.</summary>
            EscapedChar,
            /// <summary>
            /// A run of characters other than U+0000..U+001F, the backslash and the double quote.
            /// </summary>
            UnescapedChar,
            /// <summary>A backslash followed by 'u' and four hexadecimal digits.</summary>
            EscapedUnicode
        }

        // Lazily created shared grammar instance; see LazyAndWeak<T> for the lifetime rules.
        static LazyAndWeak<JsonGrammar> _instance = new LazyAndWeak<JsonGrammar>(() => new JsonGrammar());

        /// <summary>
        /// Gets the shared grammar instance used by the static factory methods.
        /// </summary>
        public static JsonGrammar Instance {
            get { return _instance.Value; }
        }

        // Prototype scanners built once in the constructor; callers receive clones.
        readonly FastInputScanner<TokenType> m_jsonExpression;
        readonly FastInputScanner<TokenType> m_stringExpression;

        // The alphabet exposed through AlphabetBuilder and DefaultAlphabet.
        readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();

        /// <summary>
        /// Gets the alphabet which backs this grammar.
        /// </summary>
        public CharAlphabet DefaultAlphabet { get { return m_defaultAlphabet; } }

        /// <summary>
        /// Defines the JSON token expressions and builds the two prototype scanners.
        /// </summary>
        public JsonGrammar() {
            // NOTE(review): the statements below are intentionally kept in their original
            // order. The token factories are inherited from Grammar<char> and presumably
            // register symbols in the alphabet as a side effect - confirm before reordering.

            // Pass the control characters U+0000..U+001F to the alphabet before any token is created.
            DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));

            // NOTE(review): Closure() is presumably "one or more" and EClosure() "zero or more";
            // this matches the JSON number grammar below - confirm against Token.
            var hexDigit = SymbolRangeToken('a','f').Or(SymbolRangeToken('A','F')).Or(SymbolRangeToken('0','9'));
            var digit9 = SymbolRangeToken('1', '9');
            var zero = SymbolToken('0');
            var digit = zero.Or(digit9);
            var dot = SymbolToken('.');
            var minus = SymbolToken('-');
            var sign = SymbolSetToken('-', '+');
            var expSign = SymbolSetToken('e', 'E');
            var letters = SymbolRangeToken('a', 'z');
            var integer = zero.Or(digit9.Cat(digit.EClosure()));
            var frac = dot.Cat(digit.Closure());
            var exp = expSign.Cat(sign.Optional()).Cat(digit.Closure());
            var quote = SymbolToken('"');
            var backSlash = SymbolToken('\\');
            var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
            var unicodeEspace = SymbolToken('u').Cat(hexDigit.Repeat(4));

            // Structural characters absorb the whitespace on both sides of them.
            var whitespace = SymbolSetToken('\n', '\r', '\t', ' ').EClosure();
            var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
            var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
            var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
            var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
            var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
            var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);

            var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());
            var literal = letters.Closure();

            // Everything except control characters, the backslash and the double quote.
            var unescaped = SymbolTokenExcept(Enumerable.Range(0, 0x20).Union(new int[] { '\\', '"' }).Select(x => (char)x));

            // Tokens recognized outside of string literals.
            var jsonExpression =
                number.Tag(TokenType.Number)
                .Or(literal.Tag(TokenType.Literal))
                .Or(quote.Tag(TokenType.StringBound))
                .Or(beginObject.Tag(TokenType.BeginObject))
                .Or(endObject.Tag(TokenType.EndObject))
                .Or(beginArray.Tag(TokenType.BeginArray))
                .Or(endArray.Tag(TokenType.EndArray))
                .Or(nameSep.Tag(TokenType.NameSeparator))
                .Or(valueSep.Tag(TokenType.ValueSeparator))
                .Or(SymbolSetToken('\n', '\r', '\t', ' ').Closure().Tag(TokenType.Whitespace));

            // Tokens recognized inside of a string literal, including the closing quote.
            var jsonStringExpression =
                quote.Tag(TokenType.StringBound)
                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
                .Or(backSlash.Cat(unicodeEspace).Tag(TokenType.EscapedUnicode))
                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));

            m_jsonExpression = BuildFastScanner(jsonExpression);
            m_stringExpression = BuildFastScanner(jsonStringExpression);
        }

        /// <summary>
        /// Creates a scanner for the tokens which appear outside of string literals.
        /// </summary>
        /// <returns>A clone of the prototype scanner held by <see cref="Instance"/>.</returns>
        public static FastInputScanner<TokenType> CreateJsonExpressionScanner() {
            return Instance.m_jsonExpression.Clone();
        }

        /// <summary>
        /// Creates a scanner for the contents of a string literal.
        /// </summary>
        /// <returns>A clone of the prototype scanner held by <see cref="Instance"/>.</returns>
        public static FastInputScanner<TokenType> CreateStringExpressionScanner() {
            return Instance.m_stringExpression.Clone();
        }

        /// <summary>
        /// Gets the alphabet builder used by the grammar; this is the same object
        /// as <see cref="DefaultAlphabet"/>.
        /// </summary>
        protected override IAlphabetBuilder<char> AlphabetBuilder {
            get {
                return m_defaultAlphabet;
            }
        }

        /// <summary>
        /// Creates a token which matches any single character in the inclusive
        /// range from <paramref name="start"/> to <paramref name="stop"/>.
        /// </summary>
        Token SymbolRangeToken(char start, char stop) {
            return SymbolToken(Enumerable.Range(start, stop - start + 1).Select(x => (char)x));
        }

        /// <summary>
        /// Compiles the specified regular expression into a table driven scanner.
        /// </summary>
        /// <param name="regexp">The expression whose alternatives are tagged with <see cref="TokenType"/> values.</param>
        /// <returns>A new scanner built from the optimized automaton.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="regexp"/> is null.</exception>
        /// <exception cref="ApplicationException">The initial state of the automaton is final, i.e. the expression matches an empty token.</exception>
        /// <exception cref="InvalidOperationException">A state of the optimized automaton carries more than one tag.</exception>
        public FastInputScanner<TokenType> BuildFastScanner(Token regexp) {
            // Fail with a descriptive exception instead of a NullReferenceException below.
            if (regexp == null)
                throw new ArgumentNullException(nameof(regexp));

            var dfa = new RegularDFA<char, TokenType>(AlphabetBuilder);

            var visitor = new RegularExpressionVisitor<TokenType>(dfa);
            regexp.Accept(visitor);
            visitor.BuildDFA();

            if (dfa.IsFinalState(dfa.InitialState))
                throw new ApplicationException("The specified language contains empty token");

            // The optimized automaton gets its own alphabet; its translation map is
            // handed to the scanner together with the tables.
            var ab = new CharAlphabet();
            var optimal = dfa.Optimize(ab);

            return new FastInputScanner<TokenType>(
                optimal.CreateTransitionTable(),
                optimal.CreateFinalStateTable(),
                NormalizeTags(optimal.CreateTagTable()),
                optimal.InitialState,
                ab.GetTranslationMap()
            );
        }

        /// <summary>
        /// Flattens the per-state tag lists into a single tag per state.
        /// </summary>
        /// <param name="tags">The tags of each state; an entry may be null or empty.</param>
        /// <returns>
        /// An array of the same length where a state without tags gets the default
        /// <see cref="TokenType"/> and a state with one tag gets that tag.
        /// </returns>
        /// <exception cref="InvalidOperationException">A state has more than one tag.</exception>
        static TokenType[] NormalizeTags(TokenType[][] tags) {
            var result = new TokenType[tags.Length];
            for (var i = 0; i < tags.Length; i++) {
                var stateTags = tags[i];
                if (stateTags == null || stateTags.Length == 0)
                    result[i] = default(TokenType);
                else if (stateTags.Length == 1)
                    result[i] = stateTags[0];
                else
                    // A specific exception type instead of the bare System.Exception;
                    // the message text is unchanged.
                    throw new InvalidOperationException($"Ambigous state tags {string.Join(", ", stateTags)}");
            }
            return result;
        }

    }
}