Mercurial > pub > ImplabNet
diff Implab/Formats/JSON/JsonGrammar.cs @ 228:6fa235c5a760 v2
Rewritten JsonScanner, JsonParser, fixed naming style
author | cin |
---|---|
date | Tue, 12 Sep 2017 01:19:12 +0300 |
parents | Implab/Formats/JSON/JSONGrammar.cs@4f82e0f161c3 |
children |
line wrap: on
line diff
using System.Linq;
using Implab.Automaton.RegularExpressions;
using System;
using Implab.Automaton;
using Implab.Components;

namespace Implab.Formats.Json {
    /// <summary>
    /// Regular grammar for JSON tokens. Builds two scanners: one for the
    /// top-level JSON expression (numbers, literals, structural characters,
    /// whitespace and the opening quote of a string) and one for the contents
    /// of a string (escapes, unescaped runs and the closing quote).
    /// </summary>
    public class JsonGrammar : Grammar<char> {
        /// <summary>
        /// Tags attached to the final states of the scanners built by this grammar.
        /// </summary>
        public enum TokenType {
            // Default value; NormalizeTags assigns it to states that carry no tag.
            None,

            // Tags produced by the JSON expression scanner.
            BeginObject,
            EndObject,
            BeginArray,
            EndArray,
            String,
            Number,
            Literal,
            NameSeparator,
            ValueSeparator,
            Whitespace,

            // StringBound is produced by both scanners (a double quote).
            StringBound,

            // Tags produced by the string expression scanner.
            EscapedChar,
            UnescapedChar,
            EscapedUnicode
        }

        // NOTE(review): LazyAndWeak is declared elsewhere (Implab.Components);
        // presumably it creates the grammar on first access and holds it weakly - confirm.
        static readonly LazyAndWeak<JsonGrammar> _instance = new LazyAndWeak<JsonGrammar>(() => new JsonGrammar());

        /// <summary>
        /// Shared grammar instance used by the static scanner factory methods.
        /// </summary>
        public static JsonGrammar Instance => _instance.Value;

        readonly InputScanner<TokenType> m_jsonExpression;
        readonly InputScanner<TokenType> m_stringExpression;
        readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();

        /// <summary>
        /// The alphabet this grammar defines its symbols in.
        /// </summary>
        public CharAlphabet DefaultAlphabet => m_defaultAlphabet;

        /// <summary>
        /// Defines the token expressions and builds both scanners.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the order in which tokens are created below is kept exactly
        /// as it was; the alphabet builder may assign symbol classes on first use and
        /// <c>SymbolTokenExcept</c> may depend on what has been defined so far - confirm
        /// before reordering.
        /// NOTE(review): <c>Closure</c> is presumably "one or more" and <c>EClosure</c>
        /// "zero or more" (this matches the JSON number grammar below) - confirm
        /// against the Token implementation.
        /// </remarks>
        public JsonGrammar() {
            // Register the control characters 0x00..0x1F with the alphabet.
            DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));

            var hexDigit = SymbolRangeToken('a','f').Or(SymbolRangeToken('A','F')).Or(SymbolRangeToken('0','9'));
            var digit9 = SymbolRangeToken('1', '9');
            var zero = SymbolToken('0');
            var digit = zero.Or(digit9);
            var dot = SymbolToken('.');
            var minus = SymbolToken('-');
            var sign = SymbolSetToken('-', '+');
            var expSign = SymbolSetToken('e', 'E');
            var letters = SymbolRangeToken('a', 'z');

            // Number parts: either a single zero or a non-zero digit followed by digits.
            var integer = zero.Or(digit9.Cat(digit.EClosure()));
            var frac = dot.Cat(digit.Closure());
            var exp = expSign.Cat(sign.Optional()).Cat(digit.Closure());

            var quote = SymbolToken('"');
            var backSlash = SymbolToken('\\');
            var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
            var unicodeEscape = SymbolToken('u').Cat(hexDigit.Repeat(4));

            // A single whitespace character. Defined once so that the optional padding
            // around structural characters and the standalone Whitespace token can
            // never disagree about what counts as whitespace.
            var whitespaceChar = SymbolSetToken('\n', '\r', '\t', ' ');
            var whitespace = whitespaceChar.EClosure();

            // Structural characters absorb the whitespace on both sides of them.
            var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
            var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
            var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
            var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
            var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
            var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);

            var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());

            // Any run of lower-case letters; this grammar does not restrict it to
            // particular words.
            var literal = letters.Closure();

            // Everything except control characters, the backslash and the double quote.
            var unescaped = SymbolTokenExcept(Enumerable.Range(0, 0x20).Union(new int[] { '\\', '"' }).Select(x => (char)x));

            var jsonExpression =
                number.Tag(TokenType.Number)
                .Or(literal.Tag(TokenType.Literal))
                .Or(quote.Tag(TokenType.StringBound))
                .Or(beginObject.Tag(TokenType.BeginObject))
                .Or(endObject.Tag(TokenType.EndObject))
                .Or(beginArray.Tag(TokenType.BeginArray))
                .Or(endArray.Tag(TokenType.EndArray))
                .Or(nameSep.Tag(TokenType.NameSeparator))
                .Or(valueSep.Tag(TokenType.ValueSeparator))
                .Or(whitespaceChar.Closure().Tag(TokenType.Whitespace));

            var jsonStringExpression =
                quote.Tag(TokenType.StringBound)
                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
                .Or(backSlash.Cat(unicodeEscape).Tag(TokenType.EscapedUnicode))
                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));

            m_jsonExpression = BuildScanner(jsonExpression);
            m_stringExpression = BuildScanner(jsonStringExpression);
        }

        /// <summary>
        /// Returns a clone of the scanner for top-level JSON tokens built by
        /// the shared <see cref="Instance"/>.
        /// </summary>
        public static InputScanner<TokenType> CreateJsonExpressionScanner() {
            return Instance.m_jsonExpression.Clone();
        }

        /// <summary>
        /// Returns a clone of the scanner for the contents of a JSON string
        /// built by the shared <see cref="Instance"/>.
        /// </summary>
        public static InputScanner<TokenType> CreateStringExpressionScanner() {
            return Instance.m_stringExpression.Clone();
        }

        /// <summary>
        /// The alphabet builder used by the grammar; always the default alphabet.
        /// </summary>
        protected override IAlphabetBuilder<char> AlphabetBuilder => m_defaultAlphabet;

        /// <summary>
        /// Creates a token matching any single character in the inclusive
        /// range <paramref name="start"/>..<paramref name="stop"/>.
        /// </summary>
        Token SymbolRangeToken(char start, char stop) {
            return SymbolToken(Enumerable.Range(start, stop - start + 1).Select(x => (char)x));
        }

        /// <summary>
        /// Builds a DFA from the regular expression, optimizes it and wraps the
        /// resulting tables in an input scanner.
        /// </summary>
        /// <param name="regexp">The tagged regular expression to compile.</param>
        /// <returns>A scanner over the optimized automaton.</returns>
        /// <exception cref="ApplicationException">
        /// The initial state of the DFA is final, i.e. the expression accepts an empty token.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// A state of the optimized automaton carries more than one tag.
        /// </exception>
        public InputScanner<TokenType> BuildScanner(Token regexp) {
            var dfa = new RegularDFA<char, TokenType>(AlphabetBuilder);

            var visitor = new RegularExpressionVisitor<TokenType>(dfa);
            regexp.Accept(visitor);
            visitor.BuildDFA();

            if (dfa.IsFinalState(dfa.InitialState)) {
                throw new ApplicationException("The specified language contains empty token");
            }

            // The optimized automaton gets its own, freshly built alphabet.
            var ab = new CharAlphabet();
            var optimal = dfa.Optimize(ab);

            return new InputScanner<TokenType>(
                optimal.CreateTransitionTable(),
                optimal.CreateFinalStateTable(),
                NormalizeTags(optimal.CreateTagTable()),
                optimal.InitialState,
                ab.CreateCharMap()
            );
        }

        /// <summary>
        /// Flattens the per-state tag lists into a single tag per state.
        /// States without tags get the default <see cref="TokenType"/> value.
        /// </summary>
        /// <param name="tags">Tag lists indexed by state; entries may be null or empty.</param>
        /// <returns>An array of the same length holding one tag per state.</returns>
        /// <exception cref="InvalidOperationException">
        /// A state has more than one tag, so the token type would be ambiguous.
        /// </exception>
        static TokenType[] NormalizeTags(TokenType[][] tags) {
            var result = new TokenType[tags.Length];
            for (var i = 0; i < tags.Length; i++) {
                var stateTags = tags[i];
                if (stateTags == null || stateTags.Length == 0) {
                    result[i] = default(TokenType);
                } else if (stateTags.Length == 1) {
                    result[i] = stateTags[0];
                } else {
                    throw new InvalidOperationException($"Ambiguous state tags {string.Join(", ", stateTags)}");
                }
            }
            return result;
        }
    }
}