Mercurial > pub > ImplabNet
comparison Implab/Formats/JSON/JsonGrammar.cs @ 228:6fa235c5a760 v2
Rewritten JsonScanner, JsonParser, fixed naming style
author | cin |
---|---|
date | Tue, 12 Sep 2017 01:19:12 +0300 (2017-09-11) |
parents | Implab/Formats/JSON/JSONGrammar.cs@4f82e0f161c3 |
children |
comparison
equal
deleted
inserted
replaced
227:8d5de4eb9c2c | 228:6fa235c5a760 |
---|---|
1 using System.Linq; | |
2 using Implab.Automaton.RegularExpressions; | |
3 using System; | |
4 using Implab.Automaton; | |
5 using Implab.Components; | |
6 | |
namespace Implab.Formats.Json {
    /// <summary>
    /// Regular grammar describing the lexical tokens of JSON. The constructor
    /// builds two scanners: one for the structural/value tokens of a JSON
    /// document and one for the contents of a JSON string.
    /// </summary>
    public class JsonGrammar : Grammar<char> {
        /// <summary>
        /// Tags attached to the final states of the scanners built by this grammar.
        /// </summary>
        public enum TokenType {
            /// <summary>Default value; assigned to states that carry no tag.</summary>
            None,
            /// <summary>The <c>{</c> character surrounded by the whitespace expression.</summary>
            BeginObject,
            /// <summary>The <c>}</c> character surrounded by the whitespace expression.</summary>
            EndObject,
            /// <summary>The <c>[</c> character surrounded by the whitespace expression.</summary>
            BeginArray,
            /// <summary>The <c>]</c> character surrounded by the whitespace expression.</summary>
            EndArray,
            /// <summary>Not tagged by any expression built in this class.</summary>
            String,
            /// <summary>Optional minus, integer part, optional fraction, optional exponent.</summary>
            Number,
            /// <summary>A run of lowercase latin letters (<c>a</c>..<c>z</c>).</summary>
            Literal,
            /// <summary>The <c>:</c> character surrounded by the whitespace expression.</summary>
            NameSeparator,
            /// <summary>The <c>,</c> character surrounded by the whitespace expression.</summary>
            ValueSeparator,
            /// <summary>A run of line feed, carriage return, tab or space characters.</summary>
            Whitespace,

            /// <summary>The double quote character; tagged in both scanners.</summary>
            StringBound,
            /// <summary>A backslash followed by one of <c>\ " / b f t n r</c>.</summary>
            EscapedChar,
            /// <summary>A run of characters built with <c>SymbolTokenExcept</c> over the control range, backslash and quote.</summary>
            UnescapedChar,
            /// <summary>A backslash, the letter <c>u</c> and the hex digit expression repeated 4 times.</summary>
            EscapedUnicode
        }

        // Holder for the shared grammar; never reassigned, hence readonly.
        static readonly LazyAndWeak<JsonGrammar> _instance = new LazyAndWeak<JsonGrammar>(() => new JsonGrammar());

        /// <summary>
        /// Shared grammar instance provided by the <c>LazyAndWeak</c> holder.
        /// </summary>
        public static JsonGrammar Instance {
            get { return _instance.Value; }
        }

        readonly InputScanner<TokenType> m_jsonExpression;
        readonly InputScanner<TokenType> m_stringExpression;
        readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();

        /// <summary>
        /// The alphabet used while the grammar expressions are being defined.
        /// </summary>
        public CharAlphabet DefaultAlphabet { get { return m_defaultAlphabet; } }

        /// <summary>
        /// Defines the JSON token expressions and builds both scanners.
        /// </summary>
        /// <exception cref="ApplicationException">
        /// Propagated from <see cref="BuildScanner"/> when an expression accepts the empty input.
        /// </exception>
        public JsonGrammar() {
            // Hand the first 0x20 characters (the control range) to the alphabet up front.
            DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));

            // NOTE(review): Closure() is presumably "one or more" and EClosure() "zero or more";
            // BuildScanner rejects languages with an empty token, and Closure() is used for
            // top-level alternatives below - confirm against Token.
            var hexDigit = SymbolRangeToken('a','f').Or(SymbolRangeToken('A','F')).Or(SymbolRangeToken('0','9'));
            var digit9 = SymbolRangeToken('1', '9');
            var zero = SymbolToken('0');
            var digit = zero.Or(digit9);
            var dot = SymbolToken('.');
            var minus = SymbolToken('-');
            var sign = SymbolSetToken('-', '+');
            var expSign = SymbolSetToken('e', 'E');
            var letters = SymbolRangeToken('a', 'z');
            // Either a single zero or a non-zero digit followed by more digits.
            var integer = zero.Or(digit9.Cat(digit.EClosure()));
            var frac = dot.Cat(digit.Closure());
            var exp = expSign.Cat(sign.Optional()).Cat(digit.Closure());
            var quote = SymbolToken('"');
            var backSlash = SymbolToken('\\');
            var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
            var unicodeEscape = SymbolToken('u').Cat(hexDigit.Repeat(4));
            var whitespace = SymbolSetToken('\n', '\r', '\t', ' ').EClosure();

            // Structural characters carry the whitespace expression on both sides.
            var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
            var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
            var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
            var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
            var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
            var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);

            var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());
            var literal = letters.Closure();
            // Excludes the control range plus backslash and double quote.
            var unescaped = SymbolTokenExcept(Enumerable.Range(0, 0x20).Union(new int[] { '\\', '"' }).Select(x => (char)x));

            // Tokens recognised outside of string literals.
            var jsonExpression =
                number.Tag(TokenType.Number)
                .Or(literal.Tag(TokenType.Literal))
                .Or(quote.Tag(TokenType.StringBound))
                .Or(beginObject.Tag(TokenType.BeginObject))
                .Or(endObject.Tag(TokenType.EndObject))
                .Or(beginArray.Tag(TokenType.BeginArray))
                .Or(endArray.Tag(TokenType.EndArray))
                .Or(nameSep.Tag(TokenType.NameSeparator))
                .Or(valueSep.Tag(TokenType.ValueSeparator))
                .Or(SymbolSetToken('\n', '\r', '\t', ' ').Closure().Tag(TokenType.Whitespace));

            // Tokens recognised inside a string literal.
            var jsonStringExpression =
                quote.Tag(TokenType.StringBound)
                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
                .Or(backSlash.Cat(unicodeEscape).Tag(TokenType.EscapedUnicode))
                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));

            m_jsonExpression = BuildScanner(jsonExpression);
            m_stringExpression = BuildScanner(jsonStringExpression);
        }

        /// <summary>
        /// Creates a scanner for JSON tokens outside of string literals.
        /// </summary>
        /// <returns>A clone of the scanner built by the shared <see cref="Instance"/>.</returns>
        public static InputScanner<TokenType> CreateJsonExpressionScanner() {
            return Instance.m_jsonExpression.Clone();
        }

        /// <summary>
        /// Creates a scanner for the contents of a JSON string literal.
        /// </summary>
        /// <returns>A clone of the scanner built by the shared <see cref="Instance"/>.</returns>
        public static InputScanner<TokenType> CreateStringExpressionScanner() {
            return Instance.m_stringExpression.Clone();
        }

        /// <summary>
        /// The alphabet builder used by the base grammar; always the default alphabet.
        /// </summary>
        protected override IAlphabetBuilder<char> AlphabetBuilder {
            get {
                return m_defaultAlphabet;
            }
        }

        /// <summary>
        /// Creates a token for every character from <paramref name="start"/>
        /// to <paramref name="stop"/>, both inclusive.
        /// </summary>
        Token SymbolRangeToken(char start, char stop) {
            return SymbolToken(Enumerable.Range(start, stop - start + 1).Select(x => (char)x));
        }

        /// <summary>
        /// Builds an input scanner from the specified tagged regular expression.
        /// </summary>
        /// <param name="regexp">The expression describing the tokens to recognise.</param>
        /// <returns>A scanner created from the tables of the optimized automaton.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="regexp"/> is <c>null</c>.</exception>
        /// <exception cref="ApplicationException">The initial state of the automaton is final.</exception>
        /// <exception cref="InvalidOperationException">A state of the optimized automaton carries more than one tag.</exception>
        public InputScanner<TokenType> BuildScanner(Token regexp) {
            if (regexp == null)
                throw new ArgumentNullException(nameof(regexp));

            var dfa = new RegularDFA<char, TokenType>(AlphabetBuilder);

            var visitor = new RegularExpressionVisitor<TokenType>(dfa);
            regexp.Accept(visitor);
            visitor.BuildDFA();

            // A final initial state means the scanner would accept without consuming any input.
            if (dfa.IsFinalState(dfa.InitialState))
                throw new ApplicationException("The specified language contains empty token");

            // The optimized automaton gets its own alphabet; the char map is taken from it below.
            var ab = new CharAlphabet();
            var optimal = dfa.Optimize(ab);

            return new InputScanner<TokenType>(
                optimal.CreateTransitionTable(),
                optimal.CreateFinalStateTable(),
                NormalizeTags(optimal.CreateTagTable()),
                optimal.InitialState,
                ab.CreateCharMap()
            );
        }

        /// <summary>
        /// Flattens the per-state tag lists into a single tag per state.
        /// </summary>
        /// <param name="tags">Tag lists indexed by state; an entry may be <c>null</c> or empty.</param>
        /// <returns>
        /// An array of the same length where states without tags get
        /// <see cref="TokenType.None"/> and states with one tag get that tag.
        /// </returns>
        /// <exception cref="InvalidOperationException">A state has more than one tag.</exception>
        static TokenType[] NormalizeTags(TokenType[][] tags) {
            var result = new TokenType[tags.Length];
            for (var i = 0; i < tags.Length; i++) {
                if (tags[i] == null || tags[i].Length == 0)
                    result[i] = default(TokenType);
                else if (tags[i].Length == 1)
                    result[i] = tags[i][0];
                else
                    // More than one tag means the expressions overlap; refuse to pick one.
                    throw new InvalidOperationException($"Ambiguous state tags {string.Join(", ", tags[i])}");
            }
            return result;
        }

    }
}