comparison Implab/Formats/JSON/JsonGrammar.cs @ 228:6fa235c5a760 v2

Rewritten JsonScanner, JsonParser, fixed naming style
author cin
date Tue, 12 Sep 2017 01:19:12 +0300 (2017-09-11)
parents Implab/Formats/JSON/JSONGrammar.cs@4f82e0f161c3
children
comparison
equal deleted inserted replaced
227:8d5de4eb9c2c 228:6fa235c5a760
1 using System.Linq;
2 using Implab.Automaton.RegularExpressions;
3 using System;
4 using Implab.Automaton;
5 using Implab.Components;
6
7 namespace Implab.Formats.Json {
/// <summary>
/// Lexical grammar for JSON text. Defines two token languages over
/// <see cref="char"/> input: one for the JSON structure (numbers, literals,
/// punctuation, whitespace and the opening quote of a string) and one for the
/// contents of a string (escapes, plain characters and the closing quote),
/// and builds an <see cref="InputScanner{TokenType}"/> for each of them.
/// </summary>
public class JsonGrammar : Grammar<char> {
    /// <summary>
    /// Tags attached to the final states of the scanners built by this grammar.
    /// </summary>
    public enum TokenType {
        /// <summary>Default value; used for states that carry no tag.</summary>
        None,
        /// <summary>A <c>{</c> with optional surrounding whitespace.</summary>
        BeginObject,
        /// <summary>A <c>}</c> with optional surrounding whitespace.</summary>
        EndObject,
        /// <summary>A <c>[</c> with optional surrounding whitespace.</summary>
        BeginArray,
        /// <summary>A <c>]</c> with optional surrounding whitespace.</summary>
        EndArray,
        /// <summary>Not produced by the scanners defined in this class.</summary>
        String,
        /// <summary>A numeric value.</summary>
        Number,
        /// <summary>A run of lowercase latin letters (<c>a</c>-<c>z</c>).</summary>
        Literal,
        /// <summary>A <c>:</c> with optional surrounding whitespace.</summary>
        NameSeparator,
        /// <summary>A <c>,</c> with optional surrounding whitespace.</summary>
        ValueSeparator,
        /// <summary>A run of line feed, carriage return, tab or space characters.</summary>
        Whitespace,

        /// <summary>A double quote that opens or closes a string.</summary>
        StringBound,
        /// <summary>A backslash followed by one of <c>\ " / b f t n r</c>.</summary>
        EscapedChar,
        /// <summary>A run of characters that need no escaping inside a string.</summary>
        UnescapedChar,
        /// <summary>A backslash, <c>u</c> and four hexadecimal digits.</summary>
        EscapedUnicode
    }

    // Shared grammar instance; creation and lifetime are delegated to LazyAndWeak.
    static readonly LazyAndWeak<JsonGrammar> _instance = new LazyAndWeak<JsonGrammar>(() => new JsonGrammar());

    /// <summary>
    /// Gets the shared grammar instance.
    /// </summary>
    public static JsonGrammar Instance {
        get { return _instance.Value; }
    }

    readonly InputScanner<TokenType> m_jsonExpression;
    readonly InputScanner<TokenType> m_stringExpression;
    readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();

    /// <summary>
    /// Gets the alphabet in which the symbols of this grammar are defined.
    /// </summary>
    public CharAlphabet DefaultAlphabet { get { return m_defaultAlphabet; } }

    /// <summary>
    /// Defines the token expressions for JSON text and for string contents
    /// and builds a scanner for each of them.
    /// </summary>
    public JsonGrammar() {
        // Control characters U+0000..U+001F: registered with the alphabet first
        // and later excluded from the set of unescaped string characters.
        var controlChars = Enumerable.Range(0, 0x20).Select(x => (char)x);
        DefineAlphabet(controlChars);

        var hexDigit = SymbolRangeToken('a', 'f').Or(SymbolRangeToken('A', 'F')).Or(SymbolRangeToken('0', '9'));
        var digit9 = SymbolRangeToken('1', '9');
        var zero = SymbolToken('0');
        var digit = zero.Or(digit9);
        var dot = SymbolToken('.');
        var minus = SymbolToken('-');
        var sign = SymbolSetToken('-', '+');
        var expMarker = SymbolSetToken('e', 'E');
        var letters = SymbolRangeToken('a', 'z');

        // Number parts: the integer part is either a single zero or starts
        // with a non-zero digit.
        var integer = zero.Or(digit9.Cat(digit.EClosure()));
        var frac = dot.Cat(digit.Closure());
        var exp = expMarker.Cat(sign.Optional()).Cat(digit.Closure());

        var quote = SymbolToken('"');
        var backSlash = SymbolToken('\\');
        var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
        var unicodeEscape = SymbolToken('u').Cat(hexDigit.Repeat(4));

        // A single whitespace character; reused for the optional padding
        // around punctuation and for the standalone whitespace token.
        var whitespaceChar = SymbolSetToken('\n', '\r', '\t', ' ');
        var whitespace = whitespaceChar.EClosure();

        // Structural characters absorb the whitespace on both sides.
        var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
        var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
        var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
        var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
        var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
        var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);

        var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());
        var literal = letters.Closure();

        // Everything except control characters, the backslash and the quote.
        var unescaped = SymbolTokenExcept(controlChars.Union(new[] { '\\', '"' }));

        // Tokens of the JSON text outside of strings; a quote only marks the
        // start of a string, its contents are described by the second expression.
        var jsonExpression =
            number.Tag(TokenType.Number)
                .Or(literal.Tag(TokenType.Literal))
                .Or(quote.Tag(TokenType.StringBound))
                .Or(beginObject.Tag(TokenType.BeginObject))
                .Or(endObject.Tag(TokenType.EndObject))
                .Or(beginArray.Tag(TokenType.BeginArray))
                .Or(endArray.Tag(TokenType.EndArray))
                .Or(nameSep.Tag(TokenType.NameSeparator))
                .Or(valueSep.Tag(TokenType.ValueSeparator))
                .Or(whitespaceChar.Closure().Tag(TokenType.Whitespace));

        // Tokens of the string contents, including the closing quote.
        var jsonStringExpression =
            quote.Tag(TokenType.StringBound)
                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
                .Or(backSlash.Cat(unicodeEscape).Tag(TokenType.EscapedUnicode))
                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));

        m_jsonExpression = BuildScanner(jsonExpression);
        m_stringExpression = BuildScanner(jsonStringExpression);
    }

    /// <summary>
    /// Creates a scanner for JSON text by cloning the scanner of the shared
    /// <see cref="Instance"/>.
    /// </summary>
    /// <returns>A new scanner for the JSON token expression.</returns>
    public static InputScanner<TokenType> CreateJsonExpressionScanner() {
        return Instance.m_jsonExpression.Clone();
    }

    /// <summary>
    /// Creates a scanner for the contents of a JSON string by cloning the
    /// scanner of the shared <see cref="Instance"/>.
    /// </summary>
    /// <returns>A new scanner for the string token expression.</returns>
    public static InputScanner<TokenType> CreateStringExpressionScanner() {
        return Instance.m_stringExpression.Clone();
    }

    /// <summary>
    /// Gets the alphabet builder used by the grammar; this is the same object
    /// as <see cref="DefaultAlphabet"/>.
    /// </summary>
    protected override IAlphabetBuilder<char> AlphabetBuilder {
        get {
            return m_defaultAlphabet;
        }
    }

    /// <summary>
    /// Creates a token that matches any single character in the inclusive
    /// range from <paramref name="start"/> to <paramref name="stop"/>.
    /// </summary>
    Token SymbolRangeToken(char start, char stop) {
        return SymbolToken(Enumerable.Range(start, stop - start + 1).Select(x => (char)x));
    }

    /// <summary>
    /// Builds a scanner for the specified tagged regular expression.
    /// </summary>
    /// <param name="regexp">The expression that describes the tokens.</param>
    /// <returns>A scanner created from the optimized automaton of the expression.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="regexp"/> is <c>null</c>.</exception>
    /// <exception cref="ApplicationException">The expression matches the empty input.</exception>
    /// <exception cref="InvalidOperationException">A state of the automaton carries more than one tag.</exception>
    public InputScanner<TokenType> BuildScanner(Token regexp) {
        if (regexp == null)
            throw new ArgumentNullException(nameof(regexp));

        var dfa = new RegularDFA<char, TokenType>(AlphabetBuilder);

        var visitor = new RegularExpressionVisitor<TokenType>(dfa);
        regexp.Accept(visitor);
        visitor.BuildDFA();

        // An initial state that is also final means the empty input is a token.
        if (dfa.IsFinalState(dfa.InitialState))
            throw new ApplicationException("The specified language contains empty token");

        // The optimized automaton gets its own alphabet, which also supplies
        // the character map of the scanner.
        var ab = new CharAlphabet();
        var optimal = dfa.Optimize(ab);

        return new InputScanner<TokenType>(
            optimal.CreateTransitionTable(),
            optimal.CreateFinalStateTable(),
            NormalizeTags(optimal.CreateTagTable()),
            optimal.InitialState,
            ab.CreateCharMap()
        );
    }

    /// <summary>
    /// Flattens the per-state tag lists into one tag per state. States without
    /// tags get <c>default(TokenType)</c>.
    /// </summary>
    /// <param name="tags">The tags of each state, indexed by state.</param>
    /// <returns>An array with a single tag for every state.</returns>
    /// <exception cref="InvalidOperationException">A state has more than one tag.</exception>
    static TokenType[] NormalizeTags(TokenType[][] tags) {
        var result = new TokenType[tags.Length];
        for (var i = 0; i < tags.Length; i++) {
            var stateTags = tags[i];
            if (stateTags == null || stateTags.Length == 0)
                result[i] = default(TokenType);
            else if (stateTags.Length == 1)
                result[i] = stateTags[0];
            else
                throw new InvalidOperationException($"Ambiguous state tags {string.Join(", ", stateTags)}");
        }
        return result;
    }

}
148 }