Mercurial > pub > ImplabNet
diff Implab/Formats/TextScanner.cs @ 176:0c3c69fe225b ref20160224
rewritten the text scanner
author | cin |
---|---|
date | Tue, 22 Mar 2016 18:58:40 +0300 |
parents | 96a89dcb4060 |
children | a0ff6a0e9c44 |
line wrap: on
line diff
// Implab/Formats/TextScanner.cs, revision 176:0c3c69fe225b (resulting side of hunk @@ -3,50 +3,146 @@).
// NOTE(review): the hunk starts at line 3 of the file, so the first two lines of the file are not
// visible here; `using System;` is restated because the class needs String, Array and
// ArgumentNullException.
using System;
using Implab.Automaton.RegularExpressions;
using System.Diagnostics;
using Implab.Automaton;
using System.IO;
using System.Text;

namespace Implab.Formats {
    /// <summary>
    /// Base class for scanners which split a stream of characters into tokens using a
    /// table-driven DFA. The scanner owns a character buffer which is either supplied by
    /// the caller or filled in chunks through <see cref="Read(char[], int, int)"/>.
    /// </summary>
    public abstract class TextScanner : Disposable {
        readonly int m_bufferMax;
        readonly int m_chunkSize;

        char[] m_buffer;
        // index of the first character which isn't consumed yet
        int m_bufferOffset;
        // index right after the last valid character in the buffer
        int m_bufferSize;
        // start and length of the last token inside m_buffer
        int m_tokenOffset;
        int m_tokenLength;

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// which reads the input in chunks.
        /// </summary>
        /// <param name="bufferMax">The maximum size of the internal buffer, in characters.</param>
        /// <param name="chunkSize">The number of characters requested from <see cref="Read(char[], int, int)"/> at once.</param>
        protected TextScanner(int bufferMax, int chunkSize) {
            // check the arguments, the fields aren't assigned yet at this point
            Debug.Assert(chunkSize <= bufferMax);

            m_bufferMax = bufferMax;
            m_chunkSize = chunkSize;
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// which scans the specified buffer. The chunk size is left at zero, so
        /// <see cref="Feed()"/> never reads additional data.
        /// </summary>
        /// <param name="buffer">The characters to scan, may be <c>null</c>.</param>
        protected TextScanner(char[] buffer) {
            if (buffer != null) {
                m_buffer = buffer;
                m_bufferSize = buffer.Length;
            }
        }

        /// <summary>
        /// (hungry) Reads the next token.
        /// </summary>
        /// <returns><c>true</c> if a token was read, <c>false</c> if there are no more tokens in the stream.</returns>
        /// <param name="dfa">The transition map for the automaton, indexed by [state, input symbol].</param>
        /// <param name="final">
        /// Final state marks, indexed by state. A non-zero entry marks the state as final.
        /// NOTE(review): the original used the entry directly as a condition, which doesn't
        /// compile for <c>int</c>; confirm the encoding against the callers.
        /// </param>
        /// <param name="tags">Tags of the states, indexed by state.</param>
        /// <param name="state">The initial state for the automaton.</param>
        /// <param name="alphabet">Maps a character code to the input symbol of the automaton.</param>
        /// <param name="tag">Receives the tags of the final state, <c>null</c> when no token was read.</param>
        /// <exception cref="ArgumentNullException">One of the tables is <c>null</c>.</exception>
        /// <exception cref="ParserException">The input can't be recognized as a token.</exception>
        internal bool ReadToken<TTag>(int[,] dfa, int[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
            if (dfa == null)
                throw new ArgumentNullException("dfa");
            if (final == null)
                throw new ArgumentNullException("final");
            if (tags == null)
                throw new ArgumentNullException("tags");
            if (alphabet == null)
                throw new ArgumentNullException("alphabet");

            // the out parameter must be assigned on the EOF path too
            tag = null;
            m_tokenLength = 0;

            var maxSymbol = alphabet.Length - 1;

            // the result of the last transition; kept apart from 'state' so the last
            // reachable state survives when the automaton can't move further
            int next = state;

            do {
                // after the next chunk is read the offset in the buffer may change
                int pos = m_bufferOffset + m_tokenLength;

                while (pos < m_bufferSize) {
                    var ch = m_buffer[pos];

                    next = dfa[state, ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]];
                    if (next == DFAConst.UNREACHABLE_STATE)
                        break;

                    state = next;
                    pos++;
                }

                m_tokenLength = pos - m_bufferOffset;
            } while (next != DFAConst.UNREACHABLE_STATE && Feed());

            m_tokenOffset = m_bufferOffset;
            m_bufferOffset += m_tokenLength;

            if (final[state] != 0) {
                tag = tags[state];
                return true;
            }

            if (m_bufferOffset == m_bufferSize) {
                if (m_tokenLength == 0) //EOF
                    return false;

                // the input has ended in the middle of a token
                throw new ParserException();
            }

            throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
        }

        /// <summary>
        /// Replaces the current buffer with the specified range of characters.
        /// </summary>
        /// <param name="buffer">The array with the characters to scan.</param>
        /// <param name="offset">The index of the first character to scan.</param>
        /// <param name="length">The number of characters to scan.</param>
        protected void Feed(char[] buffer, int offset, int length) {
            m_buffer = buffer;
            m_bufferOffset = offset;
            m_bufferSize = offset + length;
        }

        /// <summary>
        /// Reads the next chunk of the input into the buffer. The unconsumed characters are
        /// preserved, but they may be moved to the beginning of a new buffer.
        /// </summary>
        /// <returns><c>true</c> if some data was read, <c>false</c> if chunked reading is
        /// disabled or <see cref="Read(char[], int, int)"/> returned no data.</returns>
        /// <exception cref="ParserException">The buffer would grow beyond the limit.</exception>
        protected bool Feed() {
            if (m_chunkSize <= 0)
                return false;

            if (m_buffer == null) {
                Debug.Assert(m_bufferOffset == 0);
                m_buffer = new char[m_chunkSize];
                m_bufferSize = Read(m_buffer, 0, m_chunkSize);
                return (m_bufferSize != 0);
            }

            var free = m_buffer.Length - m_bufferSize;
            int read;

            if (free >= m_chunkSize) {
                // there is enough room after the data, append the chunk in place
                read = Read(m_buffer, m_bufferSize, m_chunkSize);
                if (read == 0)
                    return false;

                m_bufferSize += read;
                return true;
            }

            free += m_chunkSize;
            var used = m_bufferSize - m_bufferOffset;
            var size = used + free;

            if (size > m_bufferMax)
                throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax / 1024));

            var temp = new char[size];
            read = Read(temp, used, m_chunkSize);
            if (read == 0)
                return false;

            // move the unconsumed data to the beginning of the new buffer
            Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

            m_bufferOffset = 0;
            m_bufferSize = used + read;
            m_buffer = temp;
            return true;
        }

        /// <summary>
        /// Reads up to <paramref name="size"/> characters of the input into the buffer.
        /// </summary>
        /// <returns>The number of characters read; zero is treated by the scanner as the end of the input.</returns>
        /// <param name="buffer">The buffer to fill.</param>
        /// <param name="offset">The index in the buffer at which to start writing.</param>
        /// <param name="size">The maximum number of characters to read.</param>
        protected abstract int Read(char[] buffer, int offset, int size);

        /// <summary>
        /// Gets the text of the last token.
        /// </summary>
        public string GetTokenValue() {
            return new String(m_buffer, m_tokenOffset, m_tokenLength);
        }

        /// <summary>
        /// Copies the characters of the last token to the specified array.
        /// </summary>
        /// <param name="buffer">The destination array.</param>
        /// <param name="offset">The index in the destination at which to start writing.</param>
        public void CopyTokenTo(char[] buffer, int offset) {
            // copy the token only, not the whole internal buffer
            Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
        }

        /// <summary>
        /// Appends the characters of the last token to the specified string builder.
        /// </summary>
        /// <param name="sb">The destination string builder.</param>
        public void CopyTokenTo(StringBuilder sb) {
            sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
        }
    }
}