diff Implab/Formats/TextScanner.cs @ 176:0c3c69fe225b ref20160224

rewritten the text scanner
author cin
date Tue, 22 Mar 2016 18:58:40 +0300
parents 96a89dcb4060
children a0ff6a0e9c44
line wrap: on
line diff
--- a/Implab/Formats/TextScanner.cs	Mon Mar 21 18:41:45 2016 +0300
+++ b/Implab/Formats/TextScanner.cs	Tue Mar 22 18:58:40 2016 +0300
@@ -3,50 +3,146 @@
 using Implab.Automaton.RegularExpressions;
 using System.Diagnostics;
 using Implab.Automaton;
+using System.IO;
+using System.Text;
 
 namespace Implab.Formats {
-    public abstract class TextScanner<TTag> : Disposable {
+    public abstract class TextScanner : Disposable {
+        readonly int m_bufferMax;
+        readonly int m_chunkSize;
 
-        int m_maxSymbol;
-        int[] m_symbolMap;
-
-        readonly char[] m_buffer;
+        char[] m_buffer;
         int m_bufferOffset;
         int m_bufferSize;
+        int m_tokenOffset;
         int m_tokenLength;
 
-        TTag[] m_tags;
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class which reads its input in chunks.
+        /// </summary>
+        /// <param name="bufferMax">The maximum size, in characters, the internal buffer is allowed to grow to.</param>
+        /// <param name="chunkSize">The number of characters requested from <see cref="Read"/> at a time.</param>
+        protected TextScanner(int bufferMax, int chunkSize) {
+            // check the arguments: the fields are still zero at this point, asserting on them would always pass
+            Debug.Assert(chunkSize <= bufferMax);
+            m_bufferMax = bufferMax;
+            m_chunkSize = chunkSize;
+        }
 
-        protected bool ReadTokenInternal(DFAStateDescriptor<TTag>[] dfa, int state) {
-            Debug.Assert(dfa != null);
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class over a preloaded buffer.
+        /// </summary>
+        /// <param name="buffer">The characters to scan, when <c>null</c> the scanner is created without a buffer.</param>
+        protected TextScanner(char[] buffer) {
+            if (buffer == null)
+                return;
+            m_buffer = buffer;
+            m_bufferSize = buffer.Length;
+        }
+
+        /// <summary>
+        /// (hungry) Reads the next token: runs the automaton over the buffered input, calling <see cref="Feed()"/> when the buffer is exhausted, until no transition is possible.
+        /// </summary>
+        /// <returns><c>true</c>, if a token was read, <c>false</c> if there are no more tokens in the stream.</returns>
+        /// <param name="dfa">The transition map for the automaton, indexed by the state and the input class.</param>
+        /// <param name="final">Flags of the final states indexed by the state (a non-zero value is assumed to mark a final state).</param>
+        /// <param name="tags">Tags of the states, indexed by the state.</param>
+        /// <param name="state">The initial state for the automaton.</param>
+        /// <param name="alphabet">Maps a character code to its input class, characters beyond the map are unclassified input.</param>
+        /// <param name="tag">Receives the tags of the final state reached, <c>null</c> when no token was read.</param>
+        internal bool ReadToken<TTag>(int[,] dfa, int[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
+            Debug.Assert(dfa != null && final != null && tags != null && alphabet != null);
+            m_tokenLength = 0;
+            tag = null; // the out parameter must be assigned on every return path
+            var maxSymbol = alphabet.Length - 1;
+            var last = state; // the last reachable state, 'state' itself may become unreachable
 
             do {
-                for (var pos = m_bufferOffset; pos < m_bufferSize; pos++) {
+                // after the next chunk is read the offset in the buffer may change
+                int pos = m_bufferOffset + m_tokenLength;
+                while(pos < m_bufferSize) {
                     var ch = m_buffer[pos];
-                    state = dfa[state].transitions[m_symbolMap[ch > m_maxSymbol ? m_maxSymbol : ch]];
+                    state = dfa[last,ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]];
                     if (state == DFAConst.UNREACHABLE_STATE)
                         break;
+                    last = state; // the character is accepted, remember the state before moving on
+                    pos++;
                 }
-            } while (Feed());
+
+                m_tokenLength = pos - m_bufferOffset;
+            } while (state != DFAConst.UNREACHABLE_STATE && Feed());
+
+            m_tokenOffset = m_bufferOffset;
+            m_bufferOffset += m_tokenLength;
 
-            if (dfa[state].final) {
+            // NOTE(review): 'final' is an int[] and cannot be used as a condition directly, non-zero is assumed to mean final - confirm
+            if (final[last] != 0) {
+                tag = tags[last];
+                return true;
+            } else {
+                if (m_bufferOffset == m_bufferSize) {
+                    if (m_tokenLength == 0) //EOF
+                        return false;
+                    throw new ParserException(); // the input has ended in the middle of a token
+                }
+                throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
+            }
+        }
 
-            }
-
+        /// <summary>
+        /// Replaces the current buffer with the specified range of the array.
+        /// </summary>
+        /// <param name="buffer">The array with the data to scan.</param>
+        /// <param name="offset">The index of the first character to scan.</param>
+        /// <param name="length">The number of characters available starting at <paramref name="offset"/>.</param>
+        protected void Feed(char[] buffer, int offset, int length) {
+            var end = offset + length;
+            m_buffer = buffer;
+            m_bufferSize = end;
+            m_bufferOffset = offset;
         }
 
-        bool Feed() {
+        /// <summary>
+        /// Reads the next chunk of the input into the buffer, the buffer is allocated or enlarged when required.
+        /// </summary>
+        /// <remarks>
+        /// When the buffer is reallocated the unread data is moved to its beginning and the buffer offset is reset to zero.
+        /// </remarks>
+        /// <returns><c>true</c> if some data was read, <c>false</c> if the chunk size is not positive or <see cref="Read"/> returned no data.</returns>
+        /// <exception cref="ParserException">The enlarged buffer would exceed the limit passed to the constructor.</exception>
+        protected bool Feed() {
+            if (m_chunkSize <= 0)
+                return false;
+
+            if (m_buffer != null) {
+                var free = m_buffer.Length - m_bufferSize;
+
+                if (free < m_chunkSize) {
+                    // not enough room for a whole chunk: move the unread data to a larger buffer
+                    free += m_chunkSize;
+                    var used = m_bufferSize - m_bufferOffset;
+                    var size = used + free;
+
+                    if (size > m_bufferMax)
+                        throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax/1024));
+
+                    var temp = new char[size];
 
+                    var read = Read(temp, used, m_chunkSize);
+                    if (read == 0)
+                        return false;
+
+                    Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);
+
+                    m_bufferOffset = 0;
+                    m_bufferSize = used + read;
+                    m_buffer = temp;
+                } else {
+                    // the chunk fits into the free space at the end of the buffer
+                    var read = Read(m_buffer, m_bufferSize, m_chunkSize);
+                    if (read == 0)
+                        return false;
+
+                    m_bufferSize += read;
+                }
+                return true;
+            } else {
+                Debug.Assert(m_bufferOffset == 0);
+                m_buffer = new char[m_chunkSize];
+                m_bufferSize = Read(m_buffer, 0, m_chunkSize);
+                return (m_bufferSize != 0);
+            }
         }
 
         protected abstract int Read(char[] buffer, int offset, int size);
 
-        protected TTag[] Tags {
-            get {
-                return m_tags; 
-            }
+        /// <summary>
+        /// Gets the text of the last token read.
+        /// </summary>
+        /// <returns>A new string made of the token characters stored in the buffer.</returns>
+        public string GetTokenValue() {
+            var text = new String(m_buffer, m_tokenOffset, m_tokenLength);
+            return text;
         }
 
+        /// <summary>
+        /// Copies the characters of the last token read to the specified array.
+        /// </summary>
+        /// <param name="buffer">The destination array.</param>
+        /// <param name="offset">The index in <paramref name="buffer"/> at which the token is placed.</param>
+        public void CopyTokenTo(char[] buffer, int offset) {
+            // copy only the token, Array.CopyTo would copy the whole scanner buffer
+            Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
+        }
+
+        /// <summary>
+        /// Appends the characters of the last token read to the specified string builder.
+        /// </summary>
+        /// <param name="sb">The string builder which receives the token.</param>
+        public void CopyTokenTo(StringBuilder sb) {
+            var start = m_tokenOffset;
+            var count = m_tokenLength;
+            sb.Append(m_buffer, start, count);
+        }
          
     }
 }