Mercurial > pub > ImplabNet
comparison Implab/Formats/TextScanner.cs @ 176:0c3c69fe225b ref20160224
rewritten the text scanner
author | cin |
---|---|
date | Tue, 22 Mar 2016 18:58:40 +0300 |
parents | 96a89dcb4060 |
children | a0ff6a0e9c44 |
comparison
equal
deleted
inserted
replaced
175:96a89dcb4060 | 176:0c3c69fe225b |
---|---|
1 using System; | 1 using System; |
2 using Implab.Components; | 2 using Implab.Components; |
3 using Implab.Automaton.RegularExpressions; | 3 using Implab.Automaton.RegularExpressions; |
4 using System.Diagnostics; | 4 using System.Diagnostics; |
5 using Implab.Automaton; | 5 using Implab.Automaton; |
6 using System.IO; | |
7 using System.Text; | |
6 | 8 |
7 namespace Implab.Formats { | 9 namespace Implab.Formats { |
8 public abstract class TextScanner<TTag> : Disposable { | 10 public abstract class TextScanner : Disposable { |
11 readonly int m_bufferMax; | |
12 readonly int m_chunkSize; | |
9 | 13 |
10 int m_maxSymbol; | 14 char[] m_buffer; |
11 int[] m_symbolMap; | |
12 | |
13 readonly char[] m_buffer; | |
14 int m_bufferOffset; | 15 int m_bufferOffset; |
15 int m_bufferSize; | 16 int m_bufferSize; |
17 int m_tokenOffset; | |
16 int m_tokenLength; | 18 int m_tokenLength; |
17 | 19 |
18 TTag[] m_tags; | 20 /// <summary> |
21 /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class. | |
22 /// </summary> | |
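23 /// <param name="bufferMax">Maximum size, in characters, that the internal buffer may grow to.</param> | |
24 /// <param name="chunkSize">Number of characters requested on each <c>Read</c> call.</param> | |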
25 protected TextScanner(int bufferMax, int chunkSize) { | |
26 Debug.Assert(m_chunkSize <= m_bufferMax); | |
19 | 27 |
20 protected bool ReadTokenInternal(DFAStateDescriptor<TTag>[] dfa, int state) { | 28 m_bufferMax = bufferMax; |
21 Debug.Assert(dfa != null); | 29 m_chunkSize = chunkSize; |
30 } | |
31 | |
32 /// <summary> | |
33 /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class. | |
34 /// </summary> | |
35 /// <param name="buffer">Preloaded character buffer to scan; may be <c>null</c>.</param> | |
36 protected TextScanner(char[] buffer) { | |
37 if (buffer != null) { | |
38 m_buffer = buffer; | |
39 m_bufferSize = buffer.Length; | |
40 } | |
41 } | |
42 | |
43 /// <summary> | |
44 /// (greedy) Reads the next token. | |
45 /// </summary> | |
46 /// <returns><c>true</c> if a token was read, <c>false</c> if there are no more tokens in the stream.</returns> | |
47 /// <param name="dfa">The transition map for the automaton</param> | |
48 /// <param name="final">Final states of the automaton.</param> | |
49 /// <param name="tags">Tags for each automaton state; the entry for the final state reached is returned as the token's tag.</param> | |
50 /// <param name="state">The initial state for the automaton.</param> | |
51 internal bool ReadToken<TTag>(int[,] dfa, int[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) { | |
52 Safe.ArgumentNotNull(); | |
53 m_tokenLength = 0; | |
54 | |
55 var maxSymbol = alphabet.Length - 1; | |
22 | 56 |
23 do { | 57 do { |
24 for (var pos = m_bufferOffset; pos < m_bufferSize; pos++) { | 58 // after the next chunk is read the offset in the buffer may change |
59 int pos = m_bufferOffset + m_tokenLength; | |
60 | |
61 while(pos < m_bufferSize) { | |
25 var ch = m_buffer[pos]; | 62 var ch = m_buffer[pos]; |
26 state = dfa[state].transitions[m_symbolMap[ch > m_maxSymbol ? m_maxSymbol : ch]]; | 63 |
64 state = dfa[state,ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]]; | |
27 if (state == DFAConst.UNREACHABLE_STATE) | 65 if (state == DFAConst.UNREACHABLE_STATE) |
28 break; | 66 break; |
67 | |
68 pos++; | |
29 } | 69 } |
30 } while (Feed()); | |
31 | 70 |
32 if (dfa[state].final) { | 71 m_tokenLength = pos - m_bufferOffset; |
72 } while (state != DFAConst.UNREACHABLE_STATE && Feed()); | |
33 | 73 |
74 m_tokenOffset = m_bufferOffset; | |
75 m_bufferOffset += m_tokenLength; | |
76 | |
77 if (final[state]) { | |
78 tag = tags[state]; | |
79 return true; | |
80 } else { | |
81 if (m_bufferOffset == m_bufferSize) { | |
82 if (m_tokenLength == 0) //EOF | |
83 return false; | |
84 | |
85 throw new ParserException(); | |
86 } | |
87 throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset])); | |
88 | |
34 } | 89 } |
35 | |
36 } | 90 } |
37 | 91 |
38 bool Feed() { | 92 protected void Feed(char[] buffer, int offset, int length) { |
93 m_buffer = buffer; | |
94 m_bufferOffset = offset; | |
95 m_bufferSize = offset + length; | |
96 } | |
39 | 97 |
98 protected bool Feed() { | |
99 if (m_chunkSize <= 0) | |
100 return false; | |
101 | |
102 if (m_buffer != null) { | |
103 var free = m_buffer.Length - m_bufferSize; | |
104 | |
105 if (free < m_chunkSize) { | |
106 free += m_chunkSize; | |
107 var used = m_bufferSize - m_bufferOffset; | |
108 var size = used + free; | |
109 | |
110 if (size > m_bufferMax) | |
111 throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached"), m_bufferMax/1024); | |
112 | |
113 var temp = new char[size]; | |
114 | |
115 var read = Read(temp, used, m_chunkSize); | |
116 if (read == 0) | |
117 return false; | |
118 | |
119 Array.Copy(m_buffer, m_bufferOffset, temp, 0, used); | |
120 | |
121 m_bufferOffset = 0; | |
122 m_bufferSize = used + read; | |
123 m_buffer = temp; | |
124 } | |
125 } else { | |
126 Debug.Assert(m_bufferOffset == 0); | |
127 m_buffer = new char[m_chunkSize]; | |
128 m_bufferSize = Read(m_buffer, 0, m_chunkSize); | |
129 return (m_bufferSize != 0); | |
130 } | |
40 } | 131 } |
41 | 132 |
42 protected abstract int Read(char[] buffer, int offset, int size); | 133 protected abstract int Read(char[] buffer, int offset, int size); |
43 | 134 |
44 protected TTag[] Tags { | 135 public string GetTokenValue() { |
45 get { | 136 return new String(m_buffer, m_tokenOffset, m_tokenLength); |
46 return m_tags; | |
47 } | |
48 } | 137 } |
49 | 138 |
139 public void CopyTokenTo(char[] buffer, int offset) { | |
140 m_buffer.CopyTo(buffer, offset); | |
141 } | |
142 | |
143 public void CopyTokenTo(StringBuilder sb) { | |
144 sb.Append(m_buffer, m_tokenOffset, m_tokenLength); | |
145 } | |
50 | 146 |
51 } | 147 } |
52 } | 148 } |
53 | 149 |