view Implab/Formats/TextScanner.cs @ 180:c32688129f14 ref20160224

refactoring complete, JSONParser rewritten
author cin
date Thu, 24 Mar 2016 02:30:46 +0300
parents d5c5db0335ee
children b2b6a6640aa3
line wrap: on
line source

using System;
using Implab.Components;
using System.Diagnostics;
using Implab.Automaton;
using System.Text;

namespace Implab.Formats {
    /// <summary>
    /// Base class for scanners which split character input into tokens using
    /// a table driven automaton. The input is either a fixed character array
    /// or is pulled in chunks from the <see cref="Read"/> method implemented
    /// by a derived class.
    /// </summary>
    public abstract class TextScanner : Disposable {
        // upper limit for the size of the internal buffer, checked when the buffer is reallocated
        readonly int m_bufferMax;
        // number of characters requested from Read() at once, zero or less disables reading
        readonly int m_chunkSize;

        // the characters available to the scanner
        char[] m_buffer;
        // index of the first character which isn't consumed yet
        int m_bufferOffset;
        // index right after the last valid character in the buffer
        int m_bufferSize;
        // start and length of the most recently scanned token inside m_buffer
        int m_tokenOffset;
        int m_tokenLength;

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// which obtains its input through the <see cref="Read"/> method.
        /// </summary>
        /// <param name="bufferMax">The maximum size of the internal buffer in characters.</param>
        /// <param name="chunkSize">The number of characters requested from <see cref="Read"/> at once.</param>
        protected TextScanner(int bufferMax, int chunkSize) {
            // validate the arguments, the fields aren't assigned at this point
            Debug.Assert(chunkSize <= bufferMax);

            m_bufferMax = bufferMax;
            m_chunkSize = chunkSize;
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// which scans the specified buffer. The chunk size is left at zero, therefore
        /// <see cref="Feed()"/> never reads additional data for such instance.
        /// </summary>
        /// <param name="buffer">The characters to scan, can be <c>null</c>.</param>
        protected TextScanner(char[] buffer) {
            if (buffer != null) {
                m_buffer = buffer;
                m_bufferSize = buffer.Length;
            }
        }

        /// <summary>
        /// (hungry) Reads the next token. The automaton consumes characters while
        /// a transition exists for them, more data is requested through <see cref="Feed()"/>
        /// when the buffer is exhausted.
        /// </summary>
        /// <returns><c>true</c>, if the token was read, <c>false</c> if there are no more tokens in the stream.</returns>
        /// <param name="dfa">The transition map for the automaton, indexed by the state and the symbol class.</param>
        /// <param name="final">Final states of the automaton, indexed by the state.</param>
        /// <param name="tags">Tags of the states, indexed by the state.</param>
        /// <param name="state">The initial state for the automaton.</param>
        /// <param name="alphabet">Maps a character code to the symbol class, characters beyond
        /// the end of this map are treated as <see cref="AutomatonConst.UNCLASSIFIED_INPUT"/>.</param>
        /// <param name="tag">Receives the tags of the final state when the token is read, otherwise <c>null</c>.</param>
        /// <exception cref="ParserException">The input ended in the middle of a token or
        /// an unexpected character was met.</exception>
        internal bool ReadToken<TTag>(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
            m_tokenLength = 0;
            tag = null;

            var maxSymbol = alphabet.Length - 1;

            // the result of the last transition is tracked separately, this keeps
            // the last reachable state in the 'state' variable to check it against
            // the 'final' and 'tags' tables below
            int next = state;

            do {
                // after the next chunk is read the offset in the buffer may change
                int pos = m_bufferOffset + m_tokenLength;

                while (pos < m_bufferSize) {
                    var ch = m_buffer[pos];

                    next = dfa[state, ch > maxSymbol ? AutomatonConst.UNCLASSIFIED_INPUT : alphabet[ch]];
                    if (next == AutomatonConst.UNREACHABLE_STATE)
                        break; // the character at 'pos' doesn't belong to the token

                    state = next;
                    pos++;
                }

                m_tokenLength = pos - m_bufferOffset;
            } while (next != AutomatonConst.UNREACHABLE_STATE && Feed());

            m_tokenOffset = m_bufferOffset;
            m_bufferOffset += m_tokenLength;

            if (final[state]) {
                tag = tags[state];
                return true;
            }

            if (m_bufferOffset == m_bufferSize) {
                if (m_tokenLength == 0) //EOF
                    return false;

                // the input has ended in the middle of the token
                throw new ParserException();
            }

            throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
        }

        /// <summary>
        /// Replaces the current buffer with the specified one.
        /// </summary>
        /// <param name="buffer">The new buffer.</param>
        /// <param name="offset">The index of the first character to scan.</param>
        /// <param name="length">The number of characters to scan starting from <paramref name="offset"/>.</param>
        protected void Feed(char[] buffer, int offset, int length) {
            m_buffer = buffer;
            m_bufferOffset = offset;
            m_bufferSize = offset + length;
        }

        /// <summary>
        /// Reads the next chunk of data using <see cref="Read"/> and appends it to the
        /// characters which aren't consumed yet. The buffer is reallocated when there is
        /// less than one chunk of free space left in it.
        /// </summary>
        /// <returns><c>true</c> if some data was read, <c>false</c> if the chunk size
        /// isn't positive or <see cref="Read"/> returned no data.</returns>
        /// <exception cref="ParserException">The reallocated buffer would exceed the buffer limit.</exception>
        protected bool Feed() {
            if (m_chunkSize <= 0)
                return false;

            if (m_buffer != null) {
                var free = m_buffer.Length - m_bufferSize;

                if (free < m_chunkSize) {
                    free += m_chunkSize;
                    var used = m_bufferSize - m_bufferOffset;
                    var size = used + free;

                    if (size > m_bufferMax)
                        throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax / 1024));

                    var temp = new char[size];

                    // read first, the current buffer is left intact if there is no more data
                    var read = Read(temp, used, m_chunkSize);
                    if (read == 0)
                        return false;

                    // move the unconsumed characters to the beginning of the new buffer
                    Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

                    m_bufferOffset = 0;
                    m_bufferSize = used + read;
                    m_buffer = temp;
                } else {
                    var read = Read(m_buffer, m_bufferSize, m_chunkSize);
                    if (read == 0)
                        return false;

                    // only the characters which were actually read are valid
                    m_bufferSize += read;
                }
                return true;
            } else {
                Debug.Assert(m_bufferOffset == 0);
                m_buffer = new char[m_chunkSize];
                m_bufferSize = Read(m_buffer, 0, m_chunkSize);
                return (m_bufferSize != 0);
            }
        }

        /// <summary>
        /// Reads characters from the underlying source to the specified buffer.
        /// </summary>
        /// <returns>The number of characters read, zero is treated by <see cref="Feed()"/>
        /// as the absence of data.</returns>
        /// <param name="buffer">The buffer to place characters to.</param>
        /// <param name="offset">The index in the buffer where to place the first character.</param>
        /// <param name="size">The maximum number of characters to read.</param>
        protected abstract int Read(char[] buffer, int offset, int size);

        /// <summary>
        /// Gets the text of the most recently scanned token.
        /// </summary>
        /// <returns>The token as a new string.</returns>
        public string GetTokenValue() {
            return new String(m_buffer, m_tokenOffset, m_tokenLength);
        }

        /// <summary>
        /// Copies the characters of the most recently scanned token to the specified array.
        /// </summary>
        /// <param name="buffer">The destination array.</param>
        /// <param name="offset">The index in the destination array where to start writing.</param>
        public void CopyTokenTo(char[] buffer, int offset) {
            // copy the token only, not the whole internal buffer
            Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
        }

        /// <summary>
        /// Appends the characters of the most recently scanned token to the specified string builder.
        /// </summary>
        /// <param name="sb">The destination string builder.</param>
        public void CopyTokenTo(StringBuilder sb) {
            sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
        }

    }
}