using System.Linq;
using Implab.Automaton.RegularExpressions;
using System;
using Implab.Automaton;
using Implab.Components;

namespace Implab.Formats.Json {
    /// <summary>
    /// Lexical grammar of JSON expressed as regular expressions over <see cref="char"/>.
    /// Two scanners are built from it: one for the structural level of a document
    /// (numbers, literals, brackets, separators, whitespace and the opening quote of a
    /// string) and one for the content of a string (escape sequences, plain characters
    /// and the closing quote).
    /// </summary>
    public class JsonGrammar : Grammar<char> {
        /// <summary>
        /// Tags attached to the tokens recognized by the scanners.
        /// </summary>
        public enum TokenType {
            None,
            BeginObject,
            EndObject,
            BeginArray,
            EndArray,
            String,
            Number,
            Literal,
            NameSeparator,
            ValueSeparator,
            Whitespace,

            // StringBound is emitted by both expressions; the remaining tags below
            // are emitted only by the string-content expression.
            StringBound,
            EscapedChar,
            UnescapedChar,
            EscapedUnicode
        }

        // Holder of the shared grammar instance; the factory builds a new JsonGrammar on demand.
        // NOTE(review): judging by its name LazyAndWeak creates the value lazily and keeps it
        // through a weak reference - confirm against Implab.Components.
        static readonly LazyAndWeak<JsonGrammar> _instance = new LazyAndWeak<JsonGrammar>(() => new JsonGrammar());

        /// <summary>
        /// Shared grammar instance used by the static scanner factory methods.
        /// </summary>
        public static JsonGrammar Instance {
            get { return _instance.Value; }
        }

        readonly InputScanner<TokenType> m_jsonExpression;
        readonly InputScanner<TokenType> m_stringExpression;
        readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();

        /// <summary>
        /// The alphabet the expressions of this grammar are defined over.
        /// </summary>
        public CharAlphabet DefaultAlphabet {
            get { return m_defaultAlphabet; }
        }

        /// <summary>
        /// Defines the token expressions and builds the two prototype scanners.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the definitions read as if <c>EClosure()</c> means "zero or more"
        /// and <c>Closure()</c> means "one or more" repetitions - confirm against
        /// <c>Token</c>. The statements are kept in their original order because the
        /// alphabet builder is stateful.
        /// </remarks>
        public JsonGrammar() {
            // Register the control characters U+0000..U+001F with the alphabet up front.
            DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));

            // Building blocks of numbers.
            var hexDigit = SymbolRangeToken('a', 'f').Or(SymbolRangeToken('A', 'F')).Or(SymbolRangeToken('0', '9'));
            var digit9 = SymbolRangeToken('1', '9');
            var zero = SymbolToken('0');
            var digit = zero.Or(digit9);
            var dot = SymbolToken('.');
            var minus = SymbolToken('-');
            var sign = SymbolSetToken('-', '+');
            var expSign = SymbolSetToken('e', 'E');
            var letters = SymbolRangeToken('a', 'z');
            // Either a lone zero or a non-zero digit followed by more digits.
            var integer = zero.Or(digit9.Cat(digit.EClosure()));
            var frac = dot.Cat(digit.Closure());
            var exp = expSign.Cat(sign.Optional()).Cat(digit.Closure());

            // Building blocks of strings.
            var quote = SymbolToken('"');
            var backSlash = SymbolToken('\\');
            var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
            var unicodeEscape = SymbolToken('u').Cat(hexDigit.Repeat(4));

            // Structural tokens; each one absorbs the whitespace around it.
            var whitespace = SymbolSetToken('\n', '\r', '\t', ' ').EClosure();
            var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
            var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
            var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
            var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
            var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
            var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);

            var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());
            // A literal is any run of lower-case latin letters; it is not restricted
            // to particular keywords at this level.
            var literal = letters.Closure();
            // Anything except the control characters, the backslash and the double quote.
            var unescaped = SymbolTokenExcept(Enumerable.Range(0, 0x20).Union(new int[] { '\\', '"' }).Select(x => (char)x));

            var jsonExpression =
                number.Tag(TokenType.Number)
                .Or(literal.Tag(TokenType.Literal))
                .Or(quote.Tag(TokenType.StringBound))
                .Or(beginObject.Tag(TokenType.BeginObject))
                .Or(endObject.Tag(TokenType.EndObject))
                .Or(beginArray.Tag(TokenType.BeginArray))
                .Or(endArray.Tag(TokenType.EndArray))
                .Or(nameSep.Tag(TokenType.NameSeparator))
                .Or(valueSep.Tag(TokenType.ValueSeparator))
                .Or(SymbolSetToken('\n', '\r', '\t', ' ').Closure().Tag(TokenType.Whitespace));

            var jsonStringExpression =
                quote.Tag(TokenType.StringBound)
                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
                .Or(backSlash.Cat(unicodeEscape).Tag(TokenType.EscapedUnicode))
                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));

            m_jsonExpression = BuildScanner(jsonExpression);
            m_stringExpression = BuildScanner(jsonStringExpression);
        }

        /// <summary>
        /// Creates a scanner for the structural level of a JSON document.
        /// </summary>
        /// <returns>A clone of the prototype scanner held by <see cref="Instance"/>.</returns>
        public static InputScanner<TokenType> CreateJsonExpressionScanner() {
            return Instance.m_jsonExpression.Clone();
        }

        /// <summary>
        /// Creates a scanner for the content of a JSON string.
        /// </summary>
        /// <returns>A clone of the prototype scanner held by <see cref="Instance"/>.</returns>
        public static InputScanner<TokenType> CreateStringExpressionScanner() {
            return Instance.m_stringExpression.Clone();
        }

        /// <summary>
        /// The alphabet builder used by the base grammar; always the default alphabet.
        /// </summary>
        protected override IAlphabetBuilder<char> AlphabetBuilder {
            get { return m_defaultAlphabet; }
        }

        // Creates a token matching any single character from start to stop, both inclusive.
        Token SymbolRangeToken(char start, char stop) {
            return SymbolToken(Enumerable.Range(start, stop - start + 1).Select(x => (char)x));
        }

        /// <summary>
        /// Compiles a tagged regular expression into a scanner.
        /// </summary>
        /// <param name="regexp">The expression to compile.</param>
        /// <returns>A scanner built from the optimized automaton of the expression.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="regexp"/> is <c>null</c>.</exception>
        /// <exception cref="ApplicationException">
        /// The initial state of the automaton is final, i.e. the expression matches an empty token.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// An entry of the tag table of the optimized automaton carries more than one tag.
        /// </exception>
        public InputScanner<TokenType> BuildScanner(Token regexp) {
            if (regexp == null)
                throw new ArgumentNullException(nameof(regexp));

            var dfa = new RegularDFA<char, TokenType>(AlphabetBuilder);

            var visitor = new RegularExpressionVisitor<TokenType>(dfa);
            regexp.Accept(visitor);
            visitor.BuildDFA();

            if (dfa.IsFinalState(dfa.InitialState))
                throw new ApplicationException("The specified language contains empty token");

            // The optimized automaton gets its own alphabet, which also supplies the
            // character map of the scanner.
            var ab = new CharAlphabet();
            var optimal = dfa.Optimize(ab);

            return new InputScanner<TokenType>(
                optimal.CreateTransitionTable(),
                optimal.CreateFinalStateTable(),
                NormalizeTags(optimal.CreateTagTable()),
                optimal.InitialState,
                ab.CreateCharMap()
            );
        }

        // Flattens the tag table to a single tag per entry: a missing or empty tag list
        // becomes the default tag (None), a single tag is taken as is, and more than one
        // tag is reported as an ambiguity.
        static TokenType[] NormalizeTags(TokenType[][] tags) {
            var result = new TokenType[tags.Length];
            for (var i = 0; i < tags.Length; i++) {
                var stateTags = tags[i];
                if (stateTags == null || stateTags.Length == 0)
                    result[i] = default(TokenType);
                else if (stateTags.Length == 1)
                    result[i] = stateTags[0];
                else
                    throw new InvalidOperationException($"Ambigous state tags {string.Join(", ", stateTags)}");
            }
            return result;
        }
    }
}