Mercurial > pub > ImplabNet
comparison Implab/Formats/TextScanner.cs @ 176:0c3c69fe225b ref20160224
rewritten the text scanner
| author | cin |
|---|---|
| date | Tue, 22 Mar 2016 18:58:40 +0300 |
| parents | 96a89dcb4060 |
| children | a0ff6a0e9c44 |
comparison
equal
deleted
inserted
replaced
| 175:96a89dcb4060 | 176:0c3c69fe225b |
|---|---|
| 1 using System; | 1 using System; |
| 2 using Implab.Components; | 2 using Implab.Components; |
| 3 using Implab.Automaton.RegularExpressions; | 3 using Implab.Automaton.RegularExpressions; |
| 4 using System.Diagnostics; | 4 using System.Diagnostics; |
| 5 using Implab.Automaton; | 5 using Implab.Automaton; |
| 6 using System.IO; | |
| 7 using System.Text; | |
| 6 | 8 |
| 7 namespace Implab.Formats { | 9 namespace Implab.Formats { |
| 8 public abstract class TextScanner<TTag> : Disposable { | 10 public abstract class TextScanner : Disposable { |
| 11 readonly int m_bufferMax; | |
| 12 readonly int m_chunkSize; | |
| 9 | 13 |
| 10 int m_maxSymbol; | 14 char[] m_buffer; |
| 11 int[] m_symbolMap; | |
| 12 | |
| 13 readonly char[] m_buffer; | |
| 14 int m_bufferOffset; | 15 int m_bufferOffset; |
| 15 int m_bufferSize; | 16 int m_bufferSize; |
| 17 int m_tokenOffset; | |
| 16 int m_tokenLength; | 18 int m_tokenLength; |
| 17 | 19 |
| 18 TTag[] m_tags; | 20 /// <summary> |
| 21 /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner`1"/> class. | |
| 22 /// </summary> | |
| 23 /// <param name="bufferMax">Buffer max.</param> | |
| 24 /// <param name="chunkSize">Chunk size.</param> | |
| 25 protected TextScanner(int bufferMax, int chunkSize) { | |
| 26 Debug.Assert(m_chunkSize <= m_bufferMax); | |
| 19 | 27 |
| 20 protected bool ReadTokenInternal(DFAStateDescriptor<TTag>[] dfa, int state) { | 28 m_bufferMax = bufferMax; |
| 21 Debug.Assert(dfa != null); | 29 m_chunkSize = chunkSize; |
| 30 } | |
| 31 | |
| 32 /// <summary> | |
| 33 /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner`1"/> class. | |
| 34 /// </summary> | |
| 35 /// <param name="buffer">Buffer.</param> | |
| 36 protected TextScanner(char[] buffer) { | |
| 37 if (buffer != null) { | |
| 38 m_buffer = buffer; | |
| 39 m_bufferSize = buffer.Length; | |
| 40 } | |
| 41 } | |
| 42 | |
| 43 /// <summary> | |
| 44 /// (hungry) Reads the next token. | |
| 45 /// </summary> | |
| 46 /// <returns><c>true</c> if a token was read, <c>false</c> if there are no more tokens in the stream.</returns> | |
| 47 /// <param name="dfa">The transition map for the automaton</param> | |
| 48 /// <param name="final">Final states of the automaton.</param> | |
| 49 /// <param name="tags">Tags.</param> | |
| 50 /// <param name="state">The initial state for the automaton.</param> | |
| 51 internal bool ReadToken<TTag>(int[,] dfa, int[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) { | |
| 52 Safe.ArgumentNotNull(); | |
| 53 m_tokenLength = 0; | |
| 54 | |
| 55 var maxSymbol = alphabet.Length - 1; | |
| 22 | 56 |
| 23 do { | 57 do { |
| 24 for (var pos = m_bufferOffset; pos < m_bufferSize; pos++) { | 58 // after the next chunk is read the offset in the buffer may change |
| 59 int pos = m_bufferOffset + m_tokenLength; | |
| 60 | |
| 61 while(pos < m_bufferSize) { | |
| 25 var ch = m_buffer[pos]; | 62 var ch = m_buffer[pos]; |
| 26 state = dfa[state].transitions[m_symbolMap[ch > m_maxSymbol ? m_maxSymbol : ch]]; | 63 |
| 64 state = dfa[state,ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]]; | |
| 27 if (state == DFAConst.UNREACHABLE_STATE) | 65 if (state == DFAConst.UNREACHABLE_STATE) |
| 28 break; | 66 break; |
| 67 | |
| 68 pos++; | |
| 29 } | 69 } |
| 30 } while (Feed()); | |
| 31 | 70 |
| 32 if (dfa[state].final) { | 71 m_tokenLength = pos - m_bufferOffset; |
| 72 } while (state != DFAConst.UNREACHABLE_STATE && Feed()); | |
| 33 | 73 |
| 74 m_tokenOffset = m_bufferOffset; | |
| 75 m_bufferOffset += m_tokenLength; | |
| 76 | |
| 77 if (final[state]) { | |
| 78 tag = tags[state]; | |
| 79 return true; | |
| 80 } else { | |
| 81 if (m_bufferOffset == m_bufferSize) { | |
| 82 if (m_tokenLength == 0) //EOF | |
| 83 return false; | |
| 84 | |
| 85 throw new ParserException(); | |
| 86 } | |
| 87 throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset])); | |
| 88 | |
| 34 } | 89 } |
| 35 | |
| 36 } | 90 } |
| 37 | 91 |
| 38 bool Feed() { | 92 protected void Feed(char[] buffer, int offset, int length) { |
| 93 m_buffer = buffer; | |
| 94 m_bufferOffset = offset; | |
| 95 m_bufferSize = offset + length; | |
| 96 } | |
| 39 | 97 |
| 98 protected bool Feed() { | |
| 99 if (m_chunkSize <= 0) | |
| 100 return false; | |
| 101 | |
| 102 if (m_buffer != null) { | |
| 103 var free = m_buffer.Length - m_bufferSize; | |
| 104 | |
| 105 if (free < m_chunkSize) { | |
| 106 free += m_chunkSize; | |
| 107 var used = m_bufferSize - m_bufferOffset; | |
| 108 var size = used + free; | |
| 109 | |
| 110 if (size > m_bufferMax) | |
| 111 throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached"), m_bufferMax/1024); | |
| 112 | |
| 113 var temp = new char[size]; | |
| 114 | |
| 115 var read = Read(temp, used, m_chunkSize); | |
| 116 if (read == 0) | |
| 117 return false; | |
| 118 | |
| 119 Array.Copy(m_buffer, m_bufferOffset, temp, 0, used); | |
| 120 | |
| 121 m_bufferOffset = 0; | |
| 122 m_bufferSize = used + read; | |
| 123 m_buffer = temp; | |
| 124 } | |
| 125 } else { | |
| 126 Debug.Assert(m_bufferOffset == 0); | |
| 127 m_buffer = new char[m_chunkSize]; | |
| 128 m_bufferSize = Read(m_buffer, 0, m_chunkSize); | |
| 129 return (m_bufferSize != 0); | |
| 130 } | |
| 40 } | 131 } |
| 41 | 132 |
| 42 protected abstract int Read(char[] buffer, int offset, int size); | 133 protected abstract int Read(char[] buffer, int offset, int size); |
| 43 | 134 |
| 44 protected TTag[] Tags { | 135 public string GetTokenValue() { |
| 45 get { | 136 return new String(m_buffer, m_tokenOffset, m_tokenLength); |
| 46 return m_tags; | |
| 47 } | |
| 48 } | 137 } |
| 49 | 138 |
| 139 public void CopyTokenTo(char[] buffer, int offset) { | |
| 140 m_buffer.CopyTo(buffer, offset); | |
| 141 } | |
| 142 | |
| 143 public void CopyTokenTo(StringBuilder sb) { | |
| 144 sb.Append(m_buffer, m_tokenOffset, m_tokenLength); | |
| 145 } | |
| 50 | 146 |
| 51 } | 147 } |
| 52 } | 148 } |
| 53 | 149 |
