view Implab/Formats/TextScanner.cs @ 209:a867536c68fc v2

Bound promise to CancellationToken. Added new states to ExecutionState enum. Added Safe.Guard() method to handle cleanup of the result of the promise.
author cin
date Wed, 16 Nov 2016 03:06:08 +0300
parents 76e8f2ba12b8
children
line wrap: on
line source

using System;
using Implab.Components;
using System.Diagnostics;
using Implab.Automaton;
using System.Text;

namespace Implab.Formats {
    /// <summary>
    /// Base class for DFA-driven text scanners. Keeps a character buffer that is
    /// either supplied up front or filled on demand in chunks through
    /// <see cref="Read(char[], int, int)"/>, and exposes the most recently read token.
    /// </summary>
    public abstract class TextScanner : Disposable {
        // Upper bound for the size of the internal buffer (in chars) when it has to grow.
        readonly int m_bufferMax;
        // Number of chars requested from Read() per feed; a value <= 0 disables feeding.
        readonly int m_chunkSize;

        char[] m_buffer;
        // Index of the first char in m_buffer which is not consumed yet.
        int m_bufferOffset;
        // Index right after the last valid char in m_buffer.
        int m_bufferSize;
        // Position and length of the last token inside m_buffer.
        int m_tokenOffset;
        int m_tokenLength;

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// which fills its buffer on demand by calling <see cref="Read(char[], int, int)"/>.
        /// </summary>
        /// <param name="bufferMax">The maximum size of the internal buffer, in chars.</param>
        /// <param name="chunkSize">The number of chars requested from the source per read.</param>
        protected TextScanner(int bufferMax, int chunkSize) {
            // Check the arguments: the fields are not assigned yet at this point,
            // asserting on them would always compare 0 <= 0.
            Debug.Assert(chunkSize <= bufferMax);

            m_bufferMax = bufferMax;
            m_chunkSize = chunkSize;
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// over a preloaded buffer. The chunk size is left at zero, so <see cref="Feed()"/>
        /// never reads additional data.
        /// </summary>
        /// <param name="buffer">The chars to scan; when <c>null</c> the scanner starts with no data.</param>
        protected TextScanner(char[] buffer) {
            if (buffer != null) {
                m_buffer = buffer;
                m_bufferSize = buffer.Length;
            }
        }

        /// <summary>
        /// (greedy) Reads the next token: runs the automaton over the buffered input,
        /// feeding more data when the buffer is exhausted, until no transition is
        /// possible or the input ends.
        /// </summary>
        /// <returns><c>true</c> if the automaton stopped in a final state and a token was read,
        /// <c>false</c> if the input ended before any char was consumed.</returns>
        /// <param name="dfa">The transition table of the automaton indexed by [state, input class].</param>
        /// <param name="final">Flags marking the final states of the automaton.</param>
        /// <param name="tags">Tags indexed by state.</param>
        /// <param name="state">The initial state for the automaton.</param>
        /// <param name="alphabet">Maps a char code to its input class; chars beyond the end of
        /// this table are treated as <c>AutomatonConst.UNCLASSIFIED_INPUT</c>.</param>
        /// <param name="tag">Receives the tags of the final state, or <c>null</c> when no token was read.</param>
        /// <exception cref="ParserException">The automaton stopped in a non-final state after
        /// consuming input, or it met a symbol for which there is no transition.</exception>
        internal bool ReadToken<TTag>(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
            m_tokenLength = 0;
            tag = null;

            var maxSymbol = alphabet.Length - 1;
            int next;
            do {
                // after the next chunk is read the offset in the buffer may change,
                // therefore the position is recalculated from the token length
                int pos = m_bufferOffset + m_tokenLength;
                next = state;
                while (pos < m_bufferSize) {
                    var ch = m_buffer[pos];

                    next = dfa[next, ch > maxSymbol ? AutomatonConst.UNCLASSIFIED_INPUT : alphabet[ch]];

                    if (next == AutomatonConst.UNREACHABLE_STATE)
                        break;

                    state = next;
                    pos++;
                }
                m_tokenLength = pos - m_bufferOffset;
            } while (next != AutomatonConst.UNREACHABLE_STATE && Feed());

            // publish the token and advance the buffer past it
            m_tokenOffset = m_bufferOffset;
            m_bufferOffset += m_tokenLength;

            if (final[state]) {
                tag = tags[state];
                return true;
            }

            if (m_bufferOffset == m_bufferSize) {
                if (m_tokenLength == 0) //EOF
                    return false;

                // the input ended in the middle of a token
                throw new ParserException();
            }

            throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
        }

        /// <summary>
        /// Replaces the current buffer with the specified range of chars.
        /// </summary>
        /// <param name="buffer">The new buffer.</param>
        /// <param name="offset">The index of the first char to scan.</param>
        /// <param name="length">The number of chars available starting at <paramref name="offset"/>.</param>
        protected void Feed(char[] buffer, int offset, int length) {
            m_buffer = buffer;
            m_bufferOffset = offset;
            m_bufferSize = offset + length;
        }

        /// <summary>
        /// Reads the next chunk of data into the buffer. When the free space at the end of
        /// the buffer is less than the chunk size, the unconsumed data is moved to the
        /// beginning of a newly allocated buffer.
        /// </summary>
        /// <returns><c>true</c> if at least one char was read, <c>false</c> if feeding is disabled
        /// (the chunk size is not positive) or <see cref="Read(char[], int, int)"/> returned zero.</returns>
        /// <exception cref="ParserException">The required buffer size exceeds the buffer limit.</exception>
        protected bool Feed() {
            if (m_chunkSize <= 0)
                return false;

            if (m_buffer != null) {
                var free = m_buffer.Length - m_bufferSize;

                if (free < m_chunkSize) {
                    free += m_chunkSize;
                    var used = m_bufferSize - m_bufferOffset;
                    var size = used + free;

                    if (size > m_bufferMax)
                        throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax / 1024));

                    var temp = new char[size];

                    // read first: if there is no more data the current buffer stays untouched
                    var read = Read(temp, used, m_chunkSize);
                    if (read == 0)
                        return false;

                    Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

                    m_bufferOffset = 0;
                    m_bufferSize = used + read;
                    m_buffer = temp;
                } else {
                    var read = Read(m_buffer, m_bufferSize, m_chunkSize);
                    if (read == 0)
                        return false;
                    // advance by the number of chars actually read, which can be
                    // less than the requested chunk size
                    m_bufferSize += read;
                }
                return true;
            } else {
                Debug.Assert(m_bufferOffset == 0);
                m_buffer = new char[m_chunkSize];
                m_bufferSize = Read(m_buffer, 0, m_chunkSize);
                return (m_bufferSize != 0);
            }
        }

        /// <summary>
        /// Reads chars from the underlying source into the specified buffer.
        /// </summary>
        /// <returns>The number of chars read; zero is treated by <see cref="Feed()"/> as the end of the data.</returns>
        /// <param name="buffer">The destination buffer.</param>
        /// <param name="offset">The index in <paramref name="buffer"/> where the data should be placed.</param>
        /// <param name="size">The number of chars requested.</param>
        protected abstract int Read(char[] buffer, int offset, int size);

        /// <summary>
        /// Creates a string from the chars of the last token.
        /// </summary>
        /// <returns>The text of the last token.</returns>
        public string GetTokenValue() {
            return new String(m_buffer, m_tokenOffset, m_tokenLength);
        }

        /// <summary>
        /// Copies the chars of the last token to the specified array.
        /// </summary>
        /// <param name="buffer">The destination array.</param>
        /// <param name="offset">The index in <paramref name="buffer"/> at which the copy starts.</param>
        public void CopyTokenTo(char[] buffer, int offset) {
            Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
        }

        /// <summary>
        /// Appends the chars of the last token to the specified string builder.
        /// </summary>
        /// <param name="sb">The string builder which receives the token.</param>
        public void CopyTokenTo(StringBuilder sb) {
            sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
        }

    }
}