| 173 | 1 using System; | 
|  | 2 using Implab.Components; | 
| 175 | 3 using Implab.Automaton.RegularExpressions; | 
|  | 4 using System.Diagnostics; | 
|  | 5 using Implab.Automaton; | 
| 176 | 6 using System.IO; | 
|  | 7 using System.Text; | 
| 173 | 8 | 
namespace Implab.Formats {
    /// <summary>
    /// Base class for scanners which cut tokens out of a character buffer by running a
    /// table driven automaton over it. The buffer is either supplied by the caller or is
    /// filled on demand, chunk by chunk, through the abstract <see cref="Read"/> method.
    /// </summary>
    public abstract class TextScanner : Disposable {
        // upper limit (in chars) for the size of the internal buffer, see Feed()
        readonly int m_bufferMax;
        // number of chars requested from Read() on every Feed(); a value <= 0 disables feeding
        readonly int m_chunkSize;

        char[] m_buffer;
        // index of the first char in the buffer which isn't consumed yet
        int m_bufferOffset;
        // index next to the last valid char in the buffer
        int m_bufferSize;
        // position and length of the last token inside m_buffer
        int m_tokenOffset;
        int m_tokenLength;

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// which fills its buffer on demand using <see cref="Read"/>.
        /// </summary>
        /// <param name="bufferMax">The maximum allowed size of the buffer.</param>
        /// <param name="chunkSize">The number of chars requested from <see cref="Read"/> at once.</param>
        protected TextScanner(int bufferMax, int chunkSize) {
            // check the arguments, the fields aren't assigned at this point
            Debug.Assert(chunkSize <= bufferMax);

            m_bufferMax = bufferMax;
            m_chunkSize = chunkSize;
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// which scans the specified buffer. The chunk size is left zero, therefore
        /// <see cref="Feed()"/> never reads additional data.
        /// </summary>
        /// <param name="buffer">The buffer to scan, <c>null</c> leaves the scanner empty.</param>
        protected TextScanner(char[] buffer) {
            if (buffer != null) {
                m_buffer = buffer;
                m_bufferSize = buffer.Length;
            }
        }

        /// <summary>
        /// (hungry) Reads the next token. The automaton consumes characters while a transition
        /// exists, the buffer is refilled with <see cref="Feed()"/> when it is exhausted.
        /// </summary>
        /// <returns><c>true</c> if a token was read, <c>false</c> if there are no more tokens in the stream.</returns>
        /// <param name="dfa">The transition map for the automaton, indexed by [state, input class].</param>
        /// <param name="final">Final state flags indexed by the state.</param>
        /// <param name="tags">Tags indexed by the state.</param>
        /// <param name="state">The initial state for the automaton.</param>
        /// <param name="alphabet">Maps a character code to the input class, characters beyond
        /// the end of this array are treated as <c>DFAConst.UNCLASSIFIED_INPUT</c>.</param>
        /// <param name="tag">Receives the tags of the final state, <c>null</c> when no token was read.</param>
        /// <exception cref="ArgumentNullException">One of the tables is <c>null</c>.</exception>
        /// <exception cref="ParserException">The automaton stopped in a state which isn't final.</exception>
        internal bool ReadToken<TTag>(int[,] dfa, int[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
            if (dfa == null)
                throw new ArgumentNullException("dfa");
            if (final == null)
                throw new ArgumentNullException("final");
            if (tags == null)
                throw new ArgumentNullException("tags");
            if (alphabet == null)
                throw new ArgumentNullException("alphabet");

            // the out parameter must be assigned on every path, including EOF
            tag = null;
            m_tokenLength = 0;

            var maxSymbol = alphabet.Length - 1;
            int next;

            do {
                // after the next chunk is read the offset in the buffer may change
                int pos = m_bufferOffset + m_tokenLength;
                next = state;

                while (pos < m_bufferSize) {
                    var ch = m_buffer[pos];

                    next = dfa[next, ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]];
                    if (next == DFAConst.UNREACHABLE_STATE)
                        break;

                    // remember the last reachable state, it is used below to index
                    // the final and tags tables
                    state = next;
                    pos++;
                }

                m_tokenLength = pos - m_bufferOffset;
            } while (next != DFAConst.UNREACHABLE_STATE && Feed());

            m_tokenOffset = m_bufferOffset;
            m_bufferOffset += m_tokenLength;

            // NOTE(review): 'final' is declared as int[] and indexed by the state, a non-zero
            // entry is treated as a final state here - confirm against the callers.
            if (final[state] != 0) {
                tag = tags[state];
                return true;
            }

            if (m_bufferOffset == m_bufferSize) {
                if (m_tokenLength == 0) //EOF
                    return false;

                // the input has ended in the middle of a token
                throw new ParserException();
            }

            throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
        }

        /// <summary>
        /// Replaces the current buffer with the specified range of the supplied array.
        /// </summary>
        /// <param name="buffer">The new buffer.</param>
        /// <param name="offset">The index of the first char to scan.</param>
        /// <param name="length">The number of chars to scan.</param>
        protected void Feed(char[] buffer, int offset, int length) {
            m_buffer = buffer;
            m_bufferOffset = offset;
            m_bufferSize = offset + length;
        }

        /// <summary>
        /// Reads the next chunk of data using <see cref="Read"/> and appends it to the buffer.
        /// When the buffer has no room for the chunk a new one is allocated and the
        /// unconsumed data is moved to its beginning.
        /// </summary>
        /// <returns><c>true</c> if some data was read, <c>false</c> if feeding is disabled
        /// or <see cref="Read"/> has returned no data.</returns>
        /// <exception cref="ParserException">The buffer would exceed the limit specified in the constructor.</exception>
        protected bool Feed() {
            if (m_chunkSize <= 0)
                return false;

            if (m_buffer == null) {
                Debug.Assert(m_bufferOffset == 0);
                m_buffer = new char[m_chunkSize];
                m_bufferSize = Read(m_buffer, 0, m_chunkSize);
                return (m_bufferSize != 0);
            }

            var free = m_buffer.Length - m_bufferSize;

            if (free >= m_chunkSize) {
                // there is enough room, append the chunk to the existing data
                var appended = Read(m_buffer, m_bufferSize, m_chunkSize);
                if (appended == 0)
                    return false;

                m_bufferSize += appended;
                return true;
            }

            free += m_chunkSize;
            var used = m_bufferSize - m_bufferOffset;
            var size = used + free;

            if (size > m_bufferMax)
                throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax / 1024));

            var temp = new char[size];

            // read first, the current buffer is left untouched if there is no more data
            var read = Read(temp, used, m_chunkSize);
            if (read == 0)
                return false;

            Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

            m_bufferOffset = 0;
            m_bufferSize = used + read;
            m_buffer = temp;
            return true;
        }

        /// <summary>
        /// Reads up to <paramref name="size"/> chars from the underlying source into the buffer.
        /// </summary>
        /// <returns>The number of chars read, zero is treated as the end of the data.</returns>
        /// <param name="buffer">The destination buffer.</param>
        /// <param name="offset">The index in the buffer where to place the data.</param>
        /// <param name="size">The maximum number of chars to read.</param>
        protected abstract int Read(char[] buffer, int offset, int size);

        /// <summary>
        /// Creates a string with the text of the last token.
        /// </summary>
        public string GetTokenValue() {
            return new String(m_buffer, m_tokenOffset, m_tokenLength);
        }

        /// <summary>
        /// Copies the chars of the last token to the specified array.
        /// </summary>
        /// <param name="buffer">The destination array.</param>
        /// <param name="offset">The index in the destination array where to place the token.</param>
        public void CopyTokenTo(char[] buffer, int offset) {
            // copy only the token, not the whole internal buffer
            Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
        }

        /// <summary>
        /// Appends the chars of the last token to the specified string builder.
        /// </summary>
        /// <param name="sb">The destination string builder.</param>
        public void CopyTokenTo(StringBuilder sb) {
            sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
        }

    }
}
|  | 149 |