diff Implab/Formats/JSON/JsonGrammar.cs @ 228:6fa235c5a760 v2

Rewritten JsonScanner, JsonParser, fixed naming style
author cin
date Tue, 12 Sep 2017 01:19:12 +0300
parents Implab/Formats/JSON/JSONGrammar.cs@4f82e0f161c3
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Implab/Formats/JSON/JsonGrammar.cs	Tue Sep 12 01:19:12 2017 +0300
@@ -0,0 +1,148 @@
+using System.Linq;
+using Implab.Automaton.RegularExpressions;
+using System;
+using Implab.Automaton;
+using Implab.Components;
+
+namespace Implab.Formats.Json {
+    public class JsonGrammar : Grammar<char> {
+        /// <summary>
+        /// Tags attached to the tokens recognized by the scanners of this grammar.
+        /// </summary>
+        /// <remarks>
+        /// The numeric values are spelled out and equal the implicit numbering
+        /// (declaration order starting from zero), so <c>None</c> is the default value.
+        /// </remarks>
+        public enum TokenType {
+            /// <summary>No tag; the default value of the enumeration.</summary>
+            None = 0,
+            /// <summary>'{' together with the whitespace around it.</summary>
+            BeginObject = 1,
+            /// <summary>'}' together with the whitespace around it.</summary>
+            EndObject = 2,
+            /// <summary>'[' together with the whitespace around it.</summary>
+            BeginArray = 3,
+            /// <summary>']' together with the whitespace around it.</summary>
+            EndArray = 4,
+            /// <summary>String token. NOTE(review): not assigned by the expressions built in this class - presumably used by consumers; confirm.</summary>
+            String = 5,
+            /// <summary>Numeric token.</summary>
+            Number = 6,
+            /// <summary>Run of lowercase latin letters.</summary>
+            Literal = 7,
+            /// <summary>':' together with the whitespace around it.</summary>
+            NameSeparator = 8,
+            /// <summary>',' together with the whitespace around it.</summary>
+            ValueSeparator = 9,
+            /// <summary>Run of whitespace characters.</summary>
+            Whitespace = 10,
+
+            /// <summary>Double quote; tagged by both the JSON and the string expressions.</summary>
+            StringBound = 11,
+            /// <summary>Backslash followed by one of the single-character escapes.</summary>
+            EscapedChar = 12,
+            /// <summary>Run of characters that need no escaping inside a string.</summary>
+            UnescapedChar = 13,
+            /// <summary>Backslash, 'u' and hexadecimal digits.</summary>
+            EscapedUnicode = 14
+        }
+
+        // Holder of the shared grammar; its factory delegate creates a new JsonGrammar.
+        // NOTE(review): presumably created lazily and referenced weakly, as the LazyAndWeak name suggests - confirm.
+        static LazyAndWeak<JsonGrammar> _instance = new LazyAndWeak<JsonGrammar>(() => new JsonGrammar());
+
+        /// <summary>
+        /// Gets the shared grammar supplied by the <c>_instance</c> holder.
+        /// </summary>
+        public static JsonGrammar Instance => _instance.Value;
+
+        // Scanner prototypes assigned once in the constructor; the static Create* methods return clones of them.
+        readonly InputScanner<TokenType> m_jsonExpression;
+        readonly InputScanner<TokenType> m_stringExpression;
+
+        // Alphabet returned by both DefaultAlphabet and the AlphabetBuilder override.
+        readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();
+
+        /// <summary>Gets the alphabet owned by this grammar.</summary>
+        public CharAlphabet DefaultAlphabet => m_defaultAlphabet;
+
+        /// <summary>
+        /// Builds the JSON token expressions and compiles them into the two scanners kept by
+        /// this grammar: one for the JSON structure and one for the contents of a string.
+        /// </summary>
+        public JsonGrammar() {
+            // Register the characters 0x00..0x1F with the alphabet before any token is defined.
+            // NOTE(review): presumably this puts the control characters into a single class - confirm against DefineAlphabet.
+            DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));
+
+            // Elementary building blocks.
+            // NOTE(review): Closure() is presumably "one or more" and EClosure() "zero or more" repetitions - confirm against Token.
+            var hexDigit = SymbolRangeToken('a','f').Or(SymbolRangeToken('A','F')).Or(SymbolRangeToken('0','9'));
+            var digit9 = SymbolRangeToken('1', '9'); // non-zero digit
+            var zero = SymbolToken('0');
+            var digit = zero.Or(digit9);
+            var dot = SymbolToken('.');
+            var minus = SymbolToken('-');
+            var sign = SymbolSetToken('-', '+');
+            var expSign = SymbolSetToken('e', 'E'); // exponent marker, not a sign
+            var letters = SymbolRangeToken('a', 'z'); // lowercase only
+            // Either a single '0', or a non-zero digit followed by further digits.
+            var integer = zero.Or(digit9.Cat(digit.EClosure()));
+            var frac = dot.Cat(digit.Closure());
+            var exp = expSign.Cat(sign.Optional()).Cat(digit.Closure());
+            var quote = SymbolToken('"');
+            var backSlash = SymbolToken('\\');
+            // Characters accepted right after a backslash as a single-character escape.
+            var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
+            // 'u' followed by hexDigit.Repeat(4) - the part of a \uXXXX escape after the backslash.
+            var unicodeEspace = SymbolToken('u').Cat(hexDigit.Repeat(4));
+            // Whitespace padding that surrounds every structural token below.
+            var whitespace = SymbolSetToken('\n', '\r', '\t', ' ').EClosure();
+            var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
+            var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
+            var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
+            var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
+            var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
+            var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);
+            
+            // Optional minus, integer part, optional fraction, optional exponent.
+            var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());
+            // Any run of lowercase letters; specific words are not distinguished here.
+            var literal = letters.Closure();
+            // Built with SymbolTokenExcept from the characters 0x00..0x1F, backslash and double quote.
+            var unescaped = SymbolTokenExcept(Enumerable.Range(0, 0x20).Union(new int[] { '\\', '"' }).Select(x => (char)x));
+
+            // Expression for the JSON structure: each alternative carries its own tag.
+            // A double quote is tagged StringBound here; string contents are covered by the second expression.
+            var jsonExpression =
+                number.Tag(TokenType.Number)
+                .Or(literal.Tag(TokenType.Literal))
+                .Or(quote.Tag(TokenType.StringBound))
+                .Or(beginObject.Tag(TokenType.BeginObject))
+                .Or(endObject.Tag(TokenType.EndObject))
+                .Or(beginArray.Tag(TokenType.BeginArray))
+                .Or(endArray.Tag(TokenType.EndArray))
+                .Or(nameSep.Tag(TokenType.NameSeparator))
+                .Or(valueSep.Tag(TokenType.ValueSeparator))
+                .Or(SymbolSetToken('\n', '\r', '\t', ' ').Closure().Tag(TokenType.Whitespace));
+
+
+            // Expression for string contents: a quote, an escape sequence, or a run of unescaped characters.
+            var jsonStringExpression =
+                quote.Tag(TokenType.StringBound)
+                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
+                .Or(backSlash.Cat(unicodeEspace).Tag(TokenType.EscapedUnicode))
+                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));
+                    
+
+            // Compile both expressions once; the static Create* methods clone the results.
+            m_jsonExpression = BuildScanner(jsonExpression);
+            m_stringExpression = BuildScanner(jsonStringExpression);
+        }
+
+        /// <summary>
+        /// Creates a scanner for the JSON structure by cloning the prototype held by <see cref="Instance"/>.
+        /// </summary>
+        /// <returns>The result of <c>Clone()</c> on the shared JSON expression scanner.</returns>
+        public static InputScanner<TokenType> CreateJsonExpressionScanner() => Instance.m_jsonExpression.Clone();
+
+        /// <summary>
+        /// Creates a scanner for string contents by cloning the prototype held by <see cref="Instance"/>.
+        /// </summary>
+        /// <returns>The result of <c>Clone()</c> on the shared string expression scanner.</returns>
+        public static InputScanner<TokenType> CreateStringExpressionScanner() => Instance.m_stringExpression.Clone();
+
+        /// <summary>
+        /// Gets the alphabet builder used by this grammar; it is the same object as <see cref="DefaultAlphabet"/>.
+        /// </summary>
+        protected override IAlphabetBuilder<char> AlphabetBuilder => m_defaultAlphabet;
+
+        /// <summary>
+        /// Creates a token for the characters from <paramref name="start"/> to <paramref name="stop"/>, both inclusive.
+        /// </summary>
+        /// <param name="start">First character of the range.</param>
+        /// <param name="stop">Last character of the range.</param>
+        /// <returns>The token returned by <c>SymbolToken</c> for the enumerated characters.</returns>
+        Token SymbolRangeToken(char start, char stop) {
+            // Both bounds are inclusive, hence the "+ 1".
+            var length = stop - start + 1;
+            var symbols = Enumerable.Range(start, length).Select(code => (char)code);
+            return SymbolToken(symbols);
+        }
+
+        /// <summary>
+        /// Compiles a token expression into an input scanner.
+        /// </summary>
+        /// <param name="regexp">Root of the token expression to compile.</param>
+        /// <returns>A scanner created from the tables of the optimized automaton.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="regexp"/> is <c>null</c>.</exception>
+        /// <exception cref="ApplicationException">
+        /// The initial state of the built automaton is final, i.e. the expression accepts an empty token.
+        /// </exception>
+        public InputScanner<TokenType> BuildScanner(Token regexp) {
+            // Report a null argument explicitly instead of failing with a NullReferenceException on Accept below.
+            if (regexp == null)
+                throw new ArgumentNullException(nameof(regexp));
+
+            var dfa = new RegularDFA<char, TokenType>(AlphabetBuilder);
+
+            // The visitor walks the expression and fills the automaton.
+            var visitor = new RegularExpressionVisitor<TokenType>(dfa);
+            regexp.Accept(visitor);
+            visitor.BuildDFA();
+
+            if (dfa.IsFinalState(dfa.InitialState))
+                throw new ApplicationException("The specified language contains empty token");
+
+            // Optimize receives a fresh alphabet; the char map for the scanner is taken from it afterwards.
+            var ab = new CharAlphabet();
+            var optimal = dfa.Optimize(ab);
+
+            return new InputScanner<TokenType>(
+                optimal.CreateTransitionTable(),
+                optimal.CreateFinalStateTable(),
+                NormalizeTags(optimal.CreateTagTable()),
+                optimal.InitialState,
+                ab.CreateCharMap()
+            );
+        }
+
+        /// <summary>
+        /// Flattens a table of tag sets into a table holding a single tag per entry.
+        /// </summary>
+        /// <param name="tags">
+        /// Tag sets to flatten; an entry may be <c>null</c> or empty.
+        /// NOTE(review): presumably one entry per automaton state, as produced by CreateTagTable - confirm.
+        /// </param>
+        /// <returns>
+        /// An array of the same length: <c>default(TokenType)</c> where the entry has no tags,
+        /// otherwise the only tag of the entry.
+        /// </returns>
+        /// <exception cref="InvalidOperationException">An entry carries more than one tag.</exception>
+        static TokenType[] NormalizeTags(TokenType[][] tags) {
+            var result = new TokenType[tags.Length];
+            for (var i = 0; i < tags.Length; i++) {
+                var stateTags = tags[i];
+
+                // Elements of a new array already hold default(TokenType).
+                if (stateTags == null || stateTags.Length == 0)
+                    continue;
+
+                // More than one tag means the expression is ambiguous; a specific exception type
+                // is thrown instead of the bare System.Exception.
+                if (stateTags.Length > 1)
+                    throw new InvalidOperationException($"Ambiguous state tags {string.Join(", ", stateTags)}");
+
+                result[i] = stateTags[0];
+            }
+            return result;
+        }
+                
+    }
+}