view Implab/Formats/Json/JsonScanner.cs @ 236:302ca905c19e v2

JsonReader optimizations
author cin
date Tue, 21 Nov 2017 14:57:58 +0300
parents 3e26338eb977
children
line wrap: on
line source

using System;
using System.Globalization;
using Implab.Automaton;
using System.Text;
using Implab.Components;
using System.IO;

namespace Implab.Formats.Json {
    /// <summary>
    /// Scanner (lexer) that splits a stream of characters into JSON tokens.
    /// </summary>
    /// <remarks>
    /// Input is taken from the buffer passed to the constructor; whenever a scanner
    /// consumes everything that is buffered, the buffer is refilled through <c>Read</c>.
    /// </remarks>
    public abstract class JsonScanner : Disposable {
        /// <summary>
        /// Describes how a buffered scan (see <c>ScanBuffered</c>) has ended.
        /// </summary>
        enum ScanOutcome {
            /// <summary>The input ended and the scanner had not advanced, there is no token.</summary>
            EndOfInput,

            /// <summary>The input ended while the scanner was in a final state, the token is complete.</summary>
            TokenAtEndOfInput,

            /// <summary>The scanner stopped at a character inside the buffer.</summary>
            Stopped
        }

        // scanner used for the JSON tokens outside of string literals
        readonly FastInputScanner<JsonGrammar.TokenType> m_jsonContext = JsonGrammar.CreateJsonExpressionScanner();
        // scanner used for the contents of a string literal
        readonly FastInputScanner<JsonGrammar.TokenType> m_stringContext = JsonGrammar.CreateStringExpressionScanner();

        // scratch space for the four hex digits of a unicode escape sequence
        readonly char[] m_unescapeBuf = new char[4];
        // input buffer, m_pos is the next unread position and m_length is the end of valid data
        readonly char[] m_buffer;
        int m_length;
        int m_pos;
        // accumulates the text of the current token, including parts left in previous buffers
        readonly StringBuilder m_tokenBuilder = new StringBuilder();

        /// <summary>
        /// Creates the scanner over the specified buffer.
        /// </summary>
        /// <param name="buffer">The buffer used to hold the input, it is also passed to <c>Read</c> to be refilled.</param>
        /// <param name="pos">The position in the buffer where the scanning starts.</param>
        /// <param name="length">The upper bound of the valid data in the buffer (passed to the scanner as the scan limit).</param>
        /// <exception cref="ArgumentNullException">The <paramref name="buffer"/> is null.</exception>
        protected JsonScanner(char[] buffer, int pos, int length) {
            // a null buffer would fail anyway on the first read, report it where the mistake is made
            if (buffer == null)
                throw new ArgumentNullException(nameof(buffer));

            m_buffer = buffer;
            m_pos = pos;
            m_length = length;
        }

        /// <summary>
        /// Resets the scanner and feeds it the buffered input, refilling the buffer
        /// each time the scanner asks for more data.
        /// </summary>
        /// <param name="scanner">The scanner to run.</param>
        /// <returns>The way the scan has ended, see <see cref="ScanOutcome"/>.</returns>
        /// <exception cref="ParserException">The input ended in the middle of a token.</exception>
        /// <remarks>
        /// Every part of the buffer which is consumed before a refill is appended to the token
        /// builder, since the refill overwrites it. When the input ends, <c>m_pos</c> and
        /// <c>m_length</c> are both left at zero.
        /// </remarks>
        ScanOutcome ScanBuffered(FastInputScanner<JsonGrammar.TokenType> scanner) {
            scanner.ResetState();

            while (scanner.Scan(m_buffer, m_pos, m_length)) {
                // scanner requests new data

                if (m_pos != m_length) // capture results for the future
                    m_tokenBuilder.Append(m_buffer, m_pos, m_length - m_pos);

                // read next data
                m_length = Read(m_buffer, 0, m_buffer.Length);

                if (m_length == 0) {
                    // no data is read
                    if (scanner.Position == m_pos) {
                        // scanner hasn't moved, that's the end
                        m_pos = 0;
                        return ScanOutcome.EndOfInput;
                    }

                    if (!scanner.IsFinal)
                        throw new ParserException("Unexpected EOF");

                    m_pos = 0;
                    return ScanOutcome.TokenAtEndOfInput;
                }

                m_pos = 0;
            }

            return ScanOutcome.Stopped;
        }

        /// <summary>
        /// Reads the next chunk recognized by the specified scanner. The text of the chunk is
        /// guaranteed to be collected in the token builder only for numbers and literals.
        /// </summary>
        /// <param name="scanner">The scanner used to recognize the chunk.</param>
        /// <param name="tokenType">Receives the tag of the recognized chunk, or <c>None</c> at the end of the input.</param>
        /// <returns><c>true</c> - a chunk has been read, <c>false</c> - the end of the input is reached.</returns>
        /// <exception cref="ParserException">The input can't be recognized or ends in the middle of a token.</exception>
        bool ReadChunk(FastInputScanner<JsonGrammar.TokenType> scanner, out JsonGrammar.TokenType tokenType) {
            var outcome = ScanBuffered(scanner);

            if (outcome == ScanOutcome.EndOfInput) {
                tokenType = JsonGrammar.TokenType.None;
                return false;
            }

            if (outcome == ScanOutcome.TokenAtEndOfInput) {
                tokenType = scanner.Tag;
                return true;
            }

            var scannerPos = scanner.Position;

            // the scanner stops at scannerPos, the chunk is [m_pos, scannerPos), therefore
            // the character which wasn't accepted is the one at scannerPos
            if (!scanner.IsFinal)
                throw new ParserException($"Unexpected character '{m_buffer[scannerPos]}'");

            tokenType = scanner.Tag;

            // only numbers and literals carry a value, the text of the other tokens isn't needed
            var hasValue = tokenType == JsonGrammar.TokenType.Number || tokenType == JsonGrammar.TokenType.Literal;
            if (scannerPos != m_pos && hasValue)
                m_tokenBuilder.Append(m_buffer, m_pos, scannerPos - m_pos);

            m_pos = scannerPos;
            return true;
        }

        /// <summary>
        /// Reads the next chunk of a string literal. Unlike <c>ReadChunk</c> the text of
        /// the chunk is always appended to the token builder.
        /// </summary>
        /// <param name="scanner">The scanner used to recognize the chunk.</param>
        /// <param name="tokenType">Receives the tag of the recognized chunk, or <c>None</c> at the end of the input.</param>
        /// <returns><c>true</c> - a chunk has been read, <c>false</c> - the end of the input is reached.</returns>
        /// <exception cref="ParserException">The input can't be recognized or ends in the middle of a chunk.</exception>
        bool ReadStringChunk(FastInputScanner<JsonGrammar.TokenType> scanner, out JsonGrammar.TokenType tokenType) {
            var outcome = ScanBuffered(scanner);

            if (outcome == ScanOutcome.EndOfInput) {
                tokenType = JsonGrammar.TokenType.None;
                return false;
            }

            if (outcome == ScanOutcome.TokenAtEndOfInput) {
                tokenType = scanner.Tag;
                return true;
            }

            var scannerPos = scanner.Position;

            // the scanner stops at scannerPos
            if (!scanner.IsFinal)
                throw new ParserException($"Unexpected character '{m_buffer[scannerPos]}'");

            if (scannerPos != m_pos) {
                m_tokenBuilder.Append(m_buffer, m_pos, scannerPos - m_pos);
                m_pos = scannerPos;
            }
            tokenType = scanner.Tag;
            return true;
        }

        /// <summary>
        /// Reads the next portion of the input into the specified buffer.
        /// </summary>
        /// <param name="buffer">The buffer to fill.</param>
        /// <param name="offset">The position in the buffer where to put the data.</param>
        /// <param name="size">The maximum number of characters to read.</param>
        /// <returns>The number of characters read, zero is treated by the scanner as the end of the input.</returns>
        protected abstract int Read(char[] buffer, int offset, int size);


        /// <summary>
        /// Reads the next lexical element from the input.
        /// </summary>
        /// <param name="tokenValue">Receives the value of the token read.</param>
        /// <param name="tokenType">Receives the type of the token read.</param>
        /// <returns><c>true</c> - the token has been read, <c>false</c> - the end of the input is reached.</returns>
        /// <remarks>An exception is thrown when the input can't be recognized. Whitespace is skipped.
        /// The value of a string has its escape sequences translated, the values of numbers and literals
        /// are returned as their text, for all other tokens the value is <c>null</c>.</remarks>
        public bool ReadToken(out string tokenValue, out JsonTokenType tokenType) {
            JsonGrammar.TokenType tag;
            m_tokenBuilder.Clear();
            while (ReadChunk(m_jsonContext, out tag)) {
                switch (tag) {
                    case JsonGrammar.TokenType.StringBound:
                        // the opening quote, the rest is handled by the string scanner
                        tokenValue = ReadString();
                        tokenType = JsonTokenType.String;
                        break;
                    case JsonGrammar.TokenType.Number:
                        tokenValue = m_tokenBuilder.ToString();
                        tokenType = JsonTokenType.Number;
                        break;
                    case JsonGrammar.TokenType.Literal:
                        tokenType = JsonTokenType.Literal;
                        tokenValue = m_tokenBuilder.ToString();
                        break;
                    case JsonGrammar.TokenType.Whitespace:
                        // drop whatever has been captured during the buffer refills
                        m_tokenBuilder.Clear();
                        continue;
                    default:
                        // the remaining tags are converted to JsonTokenType by a plain cast
                        tokenType = (JsonTokenType)tag;
                        tokenValue = null;
                        break;
                }
                return true;
            }
            tokenValue = null;
            tokenType = JsonTokenType.None;
            return false;
        }

        /// <summary>
        /// Reads the contents of a string literal up to its closing bound and
        /// translates the escape sequences.
        /// </summary>
        /// <returns>The value of the string without the closing bound.</returns>
        /// <exception cref="ParserException">The input ends before the string is closed.</exception>
        string ReadString() {
            JsonGrammar.TokenType tag;
            m_tokenBuilder.Clear();

            while (ReadStringChunk(m_stringContext, out tag)) {
                switch (tag) {
                    case JsonGrammar.TokenType.StringBound:
                        // the closing bound has been appended with the chunk, drop it
                        m_tokenBuilder.Length--;
                        return m_tokenBuilder.ToString();
                    case JsonGrammar.TokenType.UnescapedChar:
                        // plain text, it is already in the builder
                        break;
                    case JsonGrammar.TokenType.EscapedUnicode:
                        // unicode escape sequence (\uXXXX in JSON, presumably the same in JsonGrammar):
                        // the last 6 chars of the builder are the sequence, the last 4 of them are hex digits
                        m_tokenBuilder.CopyTo(m_tokenBuilder.Length - 4, m_unescapeBuf, 0, 4);
                        m_tokenBuilder.Length -= 6;
                        m_tokenBuilder.Append(StringTranslator.TranslateHexUnicode(m_unescapeBuf, 0));
                        break;
                    case JsonGrammar.TokenType.EscapedChar:  // \t - escape sequence
                        // the last 2 chars of the builder are the sequence, replace them with the translated char
                        var ch = m_tokenBuilder[m_tokenBuilder.Length-1];
                        m_tokenBuilder.Length -= 2;
                        m_tokenBuilder.Append(StringTranslator.TranslateEscapedChar(ch));
                        break;
                }
            }

            throw new ParserException("Unexpected end of data");
        }
    }
}