view Implab/Formats/JSON/JsonGrammar.cs @ 234:8dd666e6b6bf v2

Added implab nuget spec
author cin
date Thu, 05 Oct 2017 09:21:23 +0300
parents 6fa235c5a760
children
line wrap: on
line source

using System.Linq;
using Implab.Automaton.RegularExpressions;
using System;
using Implab.Automaton;
using Implab.Components;

namespace Implab.Formats.Json {
    /// <summary>
    /// Lexical grammar for JSON text. The constructor builds two scanners from
    /// regular-expression tokens: one that recognizes the tokens of a JSON
    /// expression (numbers, literals, structural characters, whitespace and the
    /// opening quote of a string) and one that recognizes the pieces of a string
    /// body (unescaped runs, escapes and the closing quote).
    /// </summary>
    public class JsonGrammar : Grammar<char> {
        /// <summary>
        /// Tags attached to the tokens recognized by the scanners of this grammar.
        /// </summary>
        public enum TokenType {
            None,
            BeginObject,
            EndObject,
            BeginArray,
            EndArray,
            String,
            Number,
            Literal,
            NameSeparator,
            ValueSeparator,
            Whitespace,

            // The tags below are produced by the string expression scanner;
            // StringBound is also produced by the JSON expression scanner.
            StringBound,
            EscapedChar,
            UnescapedChar,
            EscapedUnicode
        }

        // Holder of the shared grammar; the factory creates a new JsonGrammar on demand.
        static LazyAndWeak<JsonGrammar> _instance = new LazyAndWeak<JsonGrammar>(() => new JsonGrammar());

        /// <summary>
        /// Gets the shared grammar instance provided by the <c>LazyAndWeak</c> holder.
        /// </summary>
        public static JsonGrammar Instance {
            get { return _instance.Value; }
        }

        readonly InputScanner<TokenType> m_jsonExpression;
        readonly InputScanner<TokenType> m_stringExpression;
        readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();

        /// <summary>
        /// Gets the alphabet used by this grammar while defining its tokens.
        /// </summary>
        public CharAlphabet DefaultAlphabet { get { return m_defaultAlphabet; } }

        /// <summary>
        /// Defines the JSON tokens and builds the JSON expression scanner and the
        /// string expression scanner.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the order of the token definitions below is kept as is on
        /// purpose; the symbol-token helpers presumably register characters in the
        /// alphabet as a side effect, so reordering may change the result of
        /// <c>SymbolTokenExcept</c> - confirm before moving lines around.
        /// NOTE(review): the comments assume <c>Closure</c> means "one or more" and
        /// <c>EClosure</c> means "zero or more" - confirm against <c>Token</c>.
        /// </remarks>
        public JsonGrammar() {
            // Register the control characters U+0000..U+001F up front so they can be
            // excluded from the unescaped string characters further down.
            DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));

            // Building blocks of numbers and escapes.
            var hexDigit = SymbolRangeToken('a','f').Or(SymbolRangeToken('A','F')).Or(SymbolRangeToken('0','9'));
            var digit9 = SymbolRangeToken('1', '9');
            var zero = SymbolToken('0');
            var digit = zero.Or(digit9);
            var dot = SymbolToken('.');
            var minus = SymbolToken('-');
            var sign = SymbolSetToken('-', '+');
            var expSign = SymbolSetToken('e', 'E');
            var letters = SymbolRangeToken('a', 'z');
            // Either a single zero or a digit sequence which does not start with zero.
            var integer = zero.Or(digit9.Cat(digit.EClosure()));
            var frac = dot.Cat(digit.Closure());
            var exp = expSign.Cat(sign.Optional()).Cat(digit.Closure());
            var quote = SymbolToken('"');
            var backSlash = SymbolToken('\\');
            var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
            // 'u' followed by four hexadecimal digits.
            var unicodeEscape = SymbolToken('u').Cat(hexDigit.Repeat(4));

            // Structural tokens absorb the whitespace surrounding them.
            var whitespace = SymbolSetToken('\n', '\r', '\t', ' ').EClosure();
            var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
            var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
            var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
            var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
            var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
            var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);

            var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());
            // Any run of lowercase latin letters; the scanner itself does not
            // restrict literals to particular words.
            var literal = letters.Closure();
            // Everything except control characters, the backslash and the quote.
            var unescaped = SymbolTokenExcept(Enumerable.Range(0, 0x20).Union(new int[] { '\\', '"' }).Select(x => (char)x));

            // Tokens of a JSON expression. A quote only marks the start of a string,
            // its contents are described by the string expression below.
            var jsonExpression =
                number.Tag(TokenType.Number)
                .Or(literal.Tag(TokenType.Literal))
                .Or(quote.Tag(TokenType.StringBound))
                .Or(beginObject.Tag(TokenType.BeginObject))
                .Or(endObject.Tag(TokenType.EndObject))
                .Or(beginArray.Tag(TokenType.BeginArray))
                .Or(endArray.Tag(TokenType.EndArray))
                .Or(nameSep.Tag(TokenType.NameSeparator))
                .Or(valueSep.Tag(TokenType.ValueSeparator))
                .Or(SymbolSetToken('\n', '\r', '\t', ' ').Closure().Tag(TokenType.Whitespace));

            // Tokens found inside a string: the closing quote, a backslash escape,
            // a \uXXXX escape or a run of characters which need no escaping.
            var jsonStringExpression =
                quote.Tag(TokenType.StringBound)
                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
                .Or(backSlash.Cat(unicodeEscape).Tag(TokenType.EscapedUnicode))
                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));

            m_jsonExpression = BuildScanner(jsonExpression);
            m_stringExpression = BuildScanner(jsonStringExpression);
        }

        /// <summary>
        /// Creates a scanner for JSON expression tokens by cloning the scanner of
        /// the shared <see cref="Instance"/>.
        /// </summary>
        /// <returns>A clone of the JSON expression scanner.</returns>
        public static InputScanner<TokenType> CreateJsonExpressionScanner() {
            return Instance.m_jsonExpression.Clone();
        }

        /// <summary>
        /// Creates a scanner for the contents of a JSON string by cloning the
        /// scanner of the shared <see cref="Instance"/>.
        /// </summary>
        /// <returns>A clone of the string expression scanner.</returns>
        public static InputScanner<TokenType> CreateStringExpressionScanner() {
            return Instance.m_stringExpression.Clone();
        }

        /// <summary>
        /// Gets the alphabet builder used by the grammar; this is the same object
        /// as <see cref="DefaultAlphabet"/>.
        /// </summary>
        protected override IAlphabetBuilder<char> AlphabetBuilder {
            get {
                return m_defaultAlphabet;
            }
        }

        /// <summary>
        /// Creates a token for the inclusive range of characters from
        /// <paramref name="start"/> to <paramref name="stop"/>.
        /// </summary>
        /// <param name="start">The first character of the range.</param>
        /// <param name="stop">The last character of the range, must not precede <paramref name="start"/>.</param>
        Token SymbolRangeToken(char start, char stop) {
            return SymbolToken(Enumerable.Range(start, stop - start + 1).Select(x => (char)x));
        }

        /// <summary>
        /// Builds an input scanner for the specified regular expression.
        /// </summary>
        /// <param name="regexp">The root token of the regular expression to compile.</param>
        /// <returns>The scanner created from the optimized DFA of the expression.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="regexp"/> is <c>null</c>.</exception>
        /// <exception cref="ApplicationException">The expression accepts an empty token.</exception>
        /// <exception cref="InvalidOperationException">A state of the optimized DFA carries more than one tag.</exception>
        public InputScanner<TokenType> BuildScanner(Token regexp) {
            // Fail fast with a meaningful error instead of a NullReferenceException
            // raised in the middle of the DFA construction.
            if (regexp == null)
                throw new ArgumentNullException(nameof(regexp));

            var dfa = new RegularDFA<char, TokenType>(AlphabetBuilder);

            var visitor = new RegularExpressionVisitor<TokenType>(dfa);
            regexp.Accept(visitor);
            visitor.BuildDFA();

            // An initial state which is also final means the expression matches
            // an empty input.
            if (dfa.IsFinalState(dfa.InitialState))
                throw new ApplicationException("The specified language contains empty token");

            // The optimization fills a fresh alphabet which is then used to create
            // the character map of the scanner.
            var ab = new CharAlphabet();
            var optimal = dfa.Optimize(ab);

            return new InputScanner<TokenType>(
                optimal.CreateTransitionTable(),
                optimal.CreateFinalStateTable(),
                NormalizeTags(optimal.CreateTagTable()),
                optimal.InitialState,
                ab.CreateCharMap()
            );
        }

        /// <summary>
        /// Flattens the table of tags to a single tag per entry.
        /// </summary>
        /// <param name="tags">The tags table, one array of tags per entry; an entry can be <c>null</c> or empty.</param>
        /// <returns>
        /// An array of the same length where an entry without tags gets the default
        /// <see cref="TokenType"/> and an entry with one tag gets that tag.
        /// </returns>
        /// <exception cref="InvalidOperationException">An entry has more than one tag.</exception>
        static TokenType[] NormalizeTags(TokenType[][] tags) {
            var result = new TokenType[tags.Length];
            for (var i = 0; i < tags.Length; i++) {
                var stateTags = tags[i];
                if (stateTags == null || stateTags.Length == 0)
                    result[i] = default(TokenType);
                else if (stateTags.Length == 1)
                    result[i] = stateTags[0];
                else
                    throw new InvalidOperationException($"Ambiguous state tags {string.Join(", ", stateTags)}");
            }
            return result;
        }

    }
}