annotate Implab/Formats/Json/JsonGrammar.cs @ 239:eedf4d834e67 v2

fix
author cin
date Wed, 13 Dec 2017 19:54:45 +0300
parents 302ca905c19e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
163
419aa51b04fd JSON moved to Formats namespace
cin
parents:
diff changeset
1 using System.Linq;
419aa51b04fd JSON moved to Formats namespace
cin
parents:
diff changeset
2 using Implab.Automaton.RegularExpressions;
165
e227e78d72e4 DFA refactoring
cin
parents: 163
diff changeset
3 using System;
172
92d5278d1b10 Working on text scanner
cin
parents: 165
diff changeset
4 using Implab.Automaton;
180
c32688129f14 refactoring complete, JSONParser rewritten
cin
parents: 178
diff changeset
5 using Implab.Components;
163
419aa51b04fd JSON moved to Formats namespace
cin
parents:
diff changeset
6
228
6fa235c5a760 Rewritten JsonScanner, JsonParser, fixed naming style
cin
parents: 183
diff changeset
namespace Implab.Formats.Json {
    /// <summary>
    /// Lexical grammar for JSON text. Builds two scanners from regular
    /// expressions over characters: one for the tokens found outside of string
    /// literals and one for the content of string literals.
    /// </summary>
    public class JsonGrammar : Grammar<char> {
        /// <summary>
        /// Tags attached to the final states of the scanners built by this grammar.
        /// </summary>
        public enum TokenType {
            None,
            BeginObject,
            EndObject,
            BeginArray,
            EndArray,
            String,
            Number,
            Literal,
            NameSeparator,
            ValueSeparator,
            Whitespace,

            // Tags produced while scanning the inside of a string literal.
            StringBound,
            EscapedChar,
            UnescapedChar,
            EscapedUnicode
        }

        // Shared grammar instance; LazyAndWeak defers construction until first use.
        static readonly LazyAndWeak<JsonGrammar> _instance = new LazyAndWeak<JsonGrammar>(() => new JsonGrammar());

        /// <summary>
        /// Gets the shared <see cref="JsonGrammar"/> instance.
        /// </summary>
        public static JsonGrammar Instance {
            get { return _instance.Value; }
        }

        readonly FastInputScanner<TokenType> m_jsonExpression;
        readonly FastInputScanner<TokenType> m_stringExpression;
        readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();

        /// <summary>
        /// Gets the alphabet used while the grammar expressions are being defined.
        /// </summary>
        public CharAlphabet DefaultAlphabet { get { return m_defaultAlphabet; } }

        /// <summary>
        /// Defines the JSON token expressions and builds both scanners.
        /// </summary>
        /// <remarks>
        /// NOTE(review): judging by how they are used below, <c>Closure()</c> is
        /// "one or more" and <c>EClosure()</c> is "zero or more" - confirm against
        /// the Token implementation.
        /// </remarks>
        public JsonGrammar() {
            // Register the control characters 0x00..0x1F with the alphabet.
            DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));

            var hexDigit = SymbolRangeToken('a', 'f').Or(SymbolRangeToken('A', 'F')).Or(SymbolRangeToken('0', '9'));
            var digit9 = SymbolRangeToken('1', '9');
            var zero = SymbolToken('0');
            var digit = zero.Or(digit9);
            var dot = SymbolToken('.');
            var minus = SymbolToken('-');
            var sign = SymbolSetToken('-', '+');
            var expSign = SymbolSetToken('e', 'E');
            var letters = SymbolRangeToken('a', 'z');
            var integer = zero.Or(digit9.Cat(digit.EClosure()));
            var frac = dot.Cat(digit.Closure());
            var exp = expSign.Cat(sign.Optional()).Cat(digit.Closure());
            var quote = SymbolToken('"');
            var backSlash = SymbolToken('\\');
            var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
            var unicodeEscape = SymbolToken('u').Cat(hexDigit.Repeat(4));

            // A single whitespace character; defined once and reused both for the
            // padding around structural characters and for the Whitespace token.
            var whitespaceChar = SymbolSetToken('\n', '\r', '\t', ' ');
            var whitespace = whitespaceChar.EClosure();

            // Structural characters absorb the whitespace surrounding them.
            var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
            var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
            var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
            var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
            var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
            var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);

            var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());
            var literal = letters.Closure();

            // Any character except the control characters, the backslash and the quote.
            var unescaped = SymbolTokenExcept(Enumerable.Range(0, 0x20).Union(new int[] { '\\', '"' }).Select(x => (char)x));

            var jsonExpression =
                number.Tag(TokenType.Number)
                .Or(literal.Tag(TokenType.Literal))
                .Or(quote.Tag(TokenType.StringBound))
                .Or(beginObject.Tag(TokenType.BeginObject))
                .Or(endObject.Tag(TokenType.EndObject))
                .Or(beginArray.Tag(TokenType.BeginArray))
                .Or(endArray.Tag(TokenType.EndArray))
                .Or(nameSep.Tag(TokenType.NameSeparator))
                .Or(valueSep.Tag(TokenType.ValueSeparator))
                .Or(whitespaceChar.Closure().Tag(TokenType.Whitespace));

            var jsonStringExpression =
                quote.Tag(TokenType.StringBound)
                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
                .Or(backSlash.Cat(unicodeEscape).Tag(TokenType.EscapedUnicode))
                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));

            m_jsonExpression = BuildFastScanner(jsonExpression);
            m_stringExpression = BuildFastScanner(jsonStringExpression);
        }

        /// <summary>
        /// Creates a scanner for the JSON tokens found outside of string literals.
        /// </summary>
        /// <returns>A clone of the scanner held by the shared grammar instance.</returns>
        public static FastInputScanner<TokenType> CreateJsonExpressionScanner() {
            return Instance.m_jsonExpression.Clone();
        }

        /// <summary>
        /// Creates a scanner for the content of a JSON string literal.
        /// </summary>
        /// <returns>A clone of the scanner held by the shared grammar instance.</returns>
        public static FastInputScanner<TokenType> CreateStringExpressionScanner() {
            return Instance.m_stringExpression.Clone();
        }

        /// <summary>
        /// Gets the alphabet builder used by the token factory methods.
        /// </summary>
        protected override IAlphabetBuilder<char> AlphabetBuilder {
            get {
                return m_defaultAlphabet;
            }
        }

        /// <summary>
        /// Creates a token matching any single character in the inclusive range
        /// <paramref name="start"/>..<paramref name="stop"/>.
        /// </summary>
        /// <param name="start">The first character of the range.</param>
        /// <param name="stop">The last character of the range.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="stop"/> is less than <paramref name="start"/>.
        /// </exception>
        Token SymbolRangeToken(char start, char stop) {
            // Reject inverted ranges up front instead of producing an empty symbol set.
            if (stop < start)
                throw new ArgumentOutOfRangeException(nameof(stop), "The end of the range must not precede its start");

            return SymbolToken(Enumerable.Range(start, stop - start + 1).Select(x => (char)x));
        }

        /// <summary>
        /// Builds a scanner from the specified regular expression.
        /// </summary>
        /// <param name="regexp">The tagged regular expression describing the tokens.</param>
        /// <returns>A scanner built from the optimized DFA of the expression.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="regexp"/> is null.</exception>
        /// <exception cref="ApplicationException">
        /// The initial state of the DFA is final, i.e. the expression matches an empty token.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// A state of the optimized DFA carries more than one tag.
        /// </exception>
        public FastInputScanner<TokenType> BuildFastScanner(Token regexp) {
            if (regexp == null)
                throw new ArgumentNullException(nameof(regexp));

            var dfa = new RegularDFA<char, TokenType>(AlphabetBuilder);

            var visitor = new RegularExpressionVisitor<TokenType>(dfa);
            regexp.Accept(visitor);
            visitor.BuildDFA();

            // ApplicationException is kept for callers that already catch this type.
            if (dfa.IsFinalState(dfa.InitialState))
                throw new ApplicationException("The specified language contains empty token");

            // The optimization fills a fresh alphabet; its translation map is handed to the scanner.
            var ab = new CharAlphabet();
            var optimal = dfa.Optimize(ab);

            return new FastInputScanner<TokenType>(
                optimal.CreateTransitionTable(),
                optimal.CreateFinalStateTable(),
                NormalizeTags(optimal.CreateTagTable()),
                optimal.InitialState,
                ab.GetTranslationMap()
            );
        }

        /// <summary>
        /// Flattens the per-state tag lists to a single tag per state.
        /// </summary>
        /// <param name="tags">The tag lists, one entry per state; an entry may be null or empty.</param>
        /// <returns>
        /// An array of the same length holding the single tag of each state, or
        /// the default <see cref="TokenType"/> for states without tags.
        /// </returns>
        /// <exception cref="InvalidOperationException">A state has more than one tag.</exception>
        static TokenType[] NormalizeTags(TokenType[][] tags) {
            var result = new TokenType[tags.Length];
            for (var i = 0; i < tags.Length; i++) {
                var stateTags = tags[i];
                if (stateTags == null || stateTags.Length == 0)
                    result[i] = default(TokenType);
                else if (stateTags.Length == 1)
                    result[i] = stateTags[0];
                else
                    throw new InvalidOperationException($"Ambiguous state tags {string.Join(", ", stateTags)}");
            }
            return result;
        }

    }
}