changeset 173:ecfece82ca11 ref20160224

Working on text scanner
author cin
date Tue, 15 Mar 2016 02:11:06 +0300 (2016-03-14)
parents 92d5278d1b10
children 983df35b3ca1
files Implab/Formats/BufferScanner.cs Implab/Formats/TextScanner.cs Implab/Implab.csproj
diffstat 3 files changed, 217 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Implab/Formats/BufferScanner.cs	Tue Mar 15 02:11:06 2016 +0300
@@ -0,0 +1,143 @@
+using System;
+using Implab.Automaton.RegularExpressions;
+using Implab.Automaton;
+
+namespace Implab.Formats {
+    /// <summary>Recognizes a token in a character buffer using a DFA; a mutable struct, pass it by <c>ref</c>.</summary>
+    public struct BufferScanner<TTag> {
+        char[] m_buffer;
+        int m_offset;   // start of the current token
+        int m_position; // next character to scan
+        int m_hi;       // end of the valid data
+
+        readonly int m_chunk;
+        readonly int m_limit;
+        readonly DFAStateDescriptor<TTag>[] m_dfa;
+        int m_state;
+
+        /// <summary>Creates a scanner without a buffer, use <see cref="Init"/> to supply one.</summary>
+        /// <param name="dfa">Table of the DFA states indexed by the state number.</param>
+        /// <param name="initialState">The state to start scanning from.</param>
+        /// <param name="chunk">Maximum number of characters the buffer grows by at once.</param>
+        /// <param name="limit">Maximum size of the buffer allocated by <see cref="Extend"/>.</param>
+        public BufferScanner(DFAStateDescriptor<TTag>[] dfa, int initialState, int chunk, int limit) {
+            m_dfa = dfa;
+            m_state = initialState;
+            m_chunk = chunk;
+            m_limit = limit;
+            m_buffer = null;
+            m_offset = 0;
+            m_position = 0;
+            m_hi = 0;
+        }
+
+        /// <summary>The current buffer, <see cref="Extend"/> may replace it with a new array.</summary>
+        public char[] Buffer { get { return m_buffer; } }
+
+        /// <summary>Index just past the valid data, new data is to be written starting from it.</summary>
+        public int HiMark { get { return m_hi; } }
+
+        /// <summary>Index of the next character to scan.</summary>
+        public int Position { get { return m_position; } }
+
+        /// <summary>Number of characters in the buffer which are not scanned yet.</summary>
+        public int Length { get { return m_hi - m_position; } }
+
+        /// <summary>Index of the first character of the current token.</summary>
+        public int TokenOffset { get { return m_offset; } }
+
+        /// <summary>Number of characters scanned since the start of the current token.</summary>
+        public int TokenLength { get { return m_position - m_offset; } }
+
+        /// <summary>Attaches the buffer and starts a new token at the specified position.</summary>
+        /// <param name="buffer">The buffer with the data, can be <c>null</c> when there is no data yet.</param>
+        /// <param name="position">Index of the first character to scan.</param>
+        /// <param name="length">Number of valid characters starting from the position.</param>
+        public void Init(char[] buffer, int position, int length) {
+            m_buffer = buffer;
+            m_position = position;
+            m_offset = position;
+            m_hi = position + length;
+        }
+
+        /// <summary>
+        /// Makes sure there is room for new data after <see cref="HiMark"/>. When the buffer is full a new
+        /// one is allocated and the data starting from the current token is moved to its beginning.
+        /// </summary>
+        /// <returns>Number of characters which can be written starting from <see cref="HiMark"/>.</returns>
+        /// <exception cref="ParserException">The buffer can't grow because of the size limit.</exception>
+        public int Extend() {
+            // the scanner can be started without a buffer, treat it as an empty one
+            var capacity = m_buffer == null ? 0 : m_buffer.Length;
+
+            // if the buffer has free space, use it
+            var free = capacity - m_hi;
+            if (free > 0)
+                return free;
+
+            // effective size of the buffer, the data before the current token is dropped
+            var size = capacity - m_offset;
+
+            // calculate the new size
+            int grow = Math.Min(m_limit - size, m_chunk);
+            if (grow <= 0)
+                throw new ParserException(String.Format("Input buffer {0} bytes limit exceeded", m_limit));
+
+            var temp = new char[size + grow];
+            if (m_buffer != null)
+                Array.Copy(m_buffer, m_offset, temp, 0, m_hi - m_offset);
+            m_position -= m_offset;
+            m_hi -= m_offset;
+            m_offset = 0;
+            m_buffer = temp;
+
+            return temp.Length - m_hi;
+        }
+
+        /// <summary>Marks the characters written after <see cref="HiMark"/> as valid data.</summary>
+        /// <param name="size">Number of characters written, they must fit into the buffer.</param>
+        public void RaiseMark(int size) {
+            m_hi += size;
+        }
+
+        /// <summary>Feeds the buffered characters to the DFA until the token ends or the data runs out.</summary>
+        /// <returns><c>true</c> - additional data required, <c>false</c> - the token is complete.</returns>
+        /// <exception cref="ParserException">There is no transition and the current state isn't final.</exception>
+        public bool Scan() {
+            while (m_position < m_hi) {
+                var ch = m_buffer[m_position];
+                var next = m_dfa[m_state].transitions[(int)ch];
+                if (next == DFAConst.UNREACHABLE_STATE) {
+                    // the character isn't consumed, it stays in the buffer after the token
+                    if (m_dfa[m_state].final)
+                        return false;
+
+                    throw UnexpectedToken();
+                }
+                m_state = next;
+                m_position++;
+            }
+
+            return true;
+        }
+
+        /// <summary>Signals the end of the input, the data scanned so far must form a token.</summary>
+        /// <exception cref="ParserException">The current state isn't a final one.</exception>
+        public void Eof() {
+            if (!m_dfa[m_state].final)
+                throw UnexpectedToken();
+        }
+
+        /// <summary>Gets the tags of the current DFA state.</summary>
+        public TTag[] GetTokenTags() {
+            return m_dfa[m_state].tags;
+        }
+
+        // creates the error which reports the text scanned since the start of the token
+        ParserException UnexpectedToken() {
+            var text = m_buffer == null ? String.Empty : new string(m_buffer, m_offset, m_position - m_offset);
+            return new ParserException(String.Format("Unexpected token '{0}'", text));
+        }
+    }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Implab/Formats/TextScanner.cs	Tue Mar 15 02:11:06 2016 +0300
@@ -0,0 +1,72 @@
+using System;
+using Implab.Components;
+
+namespace Implab.Formats {
+    /// <summary>Base class for the scanners which read tokens from the data supplied by <see cref="Feed"/>.</summary>
+    public abstract class TextScanner<TTag> : Disposable {
+        char[] m_buffer;
+        int m_offset;
+        int m_length;
+        int m_tokenOffset;
+        int m_tokenLength;
+        TTag[] m_tags;
+
+        // NOTE(review): nothing visible here assigns this template - confirm where it gets initialized
+        BufferScanner<TTag> m_scanner;
+
+        /// <summary>Reads the next token, use <see cref="GetToken"/> and <see cref="Tags"/> to get it.</summary>
+        /// <returns><c>true</c> - a token has been read, <c>false</c> - <see cref="EOF"/> was already set.</returns>
+        protected bool ReadTokenInternal() {
+            if (EOF)
+                return false;
+
+            // copy the template (scanners are structs) and point the copy to the buffered data
+            var inst = m_scanner;
+            inst.Init(m_buffer, m_offset, m_length);
+
+            while (inst.Scan()) {
+                Feed(ref inst);
+                // no more data will arrive, without this check Scan() would ask for it forever
+                if (EOF && inst.Length == 0)
+                    break;
+            }
+
+            // save result;
+            m_buffer = inst.Buffer;
+            m_length = inst.Length;
+            m_offset = inst.Position;
+            m_tokenOffset = inst.TokenOffset;
+            m_tokenLength = inst.TokenLength;
+            m_tags = inst.GetTokenTags();
+            return true;
+        }
+
+        /// <summary>Gets the text of the last token read.</summary>
+        protected string GetToken() {
+            return new String(m_buffer, m_tokenOffset, m_tokenLength);
+        }
+
+        /// <summary>Gets the tags of the last token read.</summary>
+        protected TTag[] Tags { get { return m_tags; } }
+
+        /// <summary>Called when the scanner runs out of data: supply more or signal the end of the input.</summary>
+        /// <param name="scanner">The scanner to feed, passed by reference since it is a struct.</param>
+        /// <example><code>
+        /// protected override void Feed(ref BufferScanner&lt;TTag&gt; scanner) {
+        ///     var size = scanner.Extend();
+        ///     var actual = m_reader.Read(scanner.Buffer, scanner.HiMark, size);
+        ///     if (actual == 0) {
+        ///         m_eof = true;
+        ///         scanner.Eof();
+        ///     } else {
+        ///         scanner.RaiseMark(actual);
+        ///     }
+        /// }
+        /// </code></example>
+        protected abstract void Feed(ref BufferScanner<TTag> scanner);
+
+        /// <summary>When <c>true</c> <see cref="ReadTokenInternal"/> stops reading tokens.</summary>
+        public abstract bool EOF { get; }
+    }
+}
+
--- a/Implab/Implab.csproj	Mon Mar 14 01:19:38 2016 +0300
+++ b/Implab/Implab.csproj	Tue Mar 15 02:11:06 2016 +0300
@@ -191,6 +191,8 @@
     <Compile Include="Automaton\RegularExpressions\RegularExpressionVisitor.cs" />
     <Compile Include="Automaton\RegularExpressions\ITaggedDFABuilder.cs" />
     <Compile Include="Automaton\RegularExpressions\DFAStateDescriptorT.cs" />
+    <Compile Include="Formats\BufferScanner.cs" />
+    <Compile Include="Formats\TextScanner.cs" />
   </ItemGroup>
   <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
   <ItemGroup />