173
|
1 using System;
|
|
2 using Implab.Components;
|
175
|
3 using System.Diagnostics;
|
|
4 using Implab.Automaton;
|
176
|
5 using System.Text;
|
173
|
6
|
|
7 namespace Implab.Formats {
|
176
|
8 public abstract class TextScanner : Disposable {
|
|
// Upper limit for the size of the internal buffer; checked by Feed() when the buffer has to grow.
readonly int m_bufferMax;

// Number of characters requested from Read() on every refill; a non-positive value disables refilling.
readonly int m_chunkSize;

// The characters being scanned; stays null until a buffer is supplied or the first chunk is read.
char[] m_buffer;

// Index of the first character in m_buffer which hasn't been consumed by a token yet.
int m_bufferOffset;

// Index just past the last valid character in m_buffer.
int m_bufferSize;

// Start index of the most recently read token inside m_buffer.
int m_tokenOffset;

// Length of the most recently read token.
int m_tokenLength;
|
174
|
17
|
176
|
/// <summary>
/// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
/// which obtains its input in chunks through <see cref="Read(char[], int, int)"/>.
/// </summary>
/// <param name="bufferMax">The maximum allowed size of the internal buffer.</param>
/// <param name="chunkSize">The number of characters requested on each refill of the buffer.</param>
protected TextScanner(int bufferMax, int chunkSize) {
    // Check the arguments rather than the fields: the fields are still zero
    // at this point, so an assertion on them could never fail.
    Debug.Assert(chunkSize <= bufferMax);

    m_bufferMax = bufferMax;
    m_chunkSize = chunkSize;
}
|
173
|
29
|
176
|
/// <summary>
/// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
/// which scans the contents of an already filled buffer.
/// </summary>
/// <param name="buffer">The characters to scan; when <c>null</c> the scanner is created without a buffer.</param>
protected TextScanner(char[] buffer) {
    // nothing to set up without a buffer
    if (buffer == null)
        return;

    // the whole array is treated as valid input
    m_bufferSize = buffer.Length;
    m_buffer = buffer;
}
|
|
40
|
|
/// <summary>
/// (hungry) Reads the next token: characters are consumed for as long as the
/// automaton has a transition for them, the buffer is refilled when it runs out.
/// </summary>
/// <returns><c>true</c> if a token was read, <c>false</c> if there are no more tokens in the stream.</returns>
/// <param name="dfa">The transition map for the automaton, indexed by state and input symbol.</param>
/// <param name="final">Final states of the automaton, indexed by state.</param>
/// <param name="tags">Tags of the states, indexed by state.</param>
/// <param name="state">The initial state for the automaton.</param>
/// <param name="alphabet">Maps a character code to an input symbol; characters beyond the end of
/// this map are treated as <c>AutomatonConst.UNCLASSIFIED_INPUT</c>.</param>
/// <param name="tag">Receives the tags of the final state when a token is read, otherwise <c>null</c>.</param>
/// <exception cref="ParserException">The input has ended in the middle of a token
/// or an unexpected symbol has been encountered.</exception>
internal bool ReadToken<TTag>(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
    m_tokenLength = 0;
    tag = null;

    var maxSymbol = alphabet.Length - 1;

    // the result of the last transition; 'state' always holds the last reachable state
    int next;

    do {
        // after the next chunk is read the offset in the buffer may change
        int pos = m_bufferOffset + m_tokenLength;
        next = state;

        while (pos < m_bufferSize) {
            var ch = m_buffer[pos];

            next = dfa[state, ch > maxSymbol ? AutomatonConst.UNCLASSIFIED_INPUT : alphabet[ch]];

            if (next == AutomatonConst.UNREACHABLE_STATE)
                break;

            state = next;
            pos++;
        }

        m_tokenLength = pos - m_bufferOffset;

        // More input is needed only when the buffer has been exhausted. If the automaton
        // has stopped on a symbol inside the buffer, the token is complete and requesting
        // another chunk would just keep pulling the rest of the input into the buffer.
    } while (next != AutomatonConst.UNREACHABLE_STATE && Feed());

    m_tokenOffset = m_bufferOffset;
    m_bufferOffset += m_tokenLength;

    if (final[state]) {
        tag = tags[state];
        return true;
    }

    if (m_bufferOffset == m_bufferSize) {
        if (m_tokenLength == 0) //EOF
            return false;

        // the input has ended in a non-final state
        throw new ParserException();
    }

    throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
}
|
173
|
98
|
176
|
/// <summary>
/// Makes the scanner read from the specified range of the supplied array.
/// </summary>
/// <param name="buffer">The array which becomes the scanner's buffer.</param>
/// <param name="offset">The index of the first character to scan.</param>
/// <param name="length">The number of characters available starting at <paramref name="offset"/>.</param>
protected void Feed(char[] buffer, int offset, int length) {
    // the size field holds the end index of the valid data, not its length
    var end = offset + length;

    m_buffer = buffer;
    m_bufferSize = end;
    m_bufferOffset = offset;
}
|
|
104
|
176
|
/// <summary>
/// Reads the next chunk of input into the buffer, the buffer is enlarged when
/// there is not enough room for a whole chunk after the data it already holds.
/// </summary>
/// <returns><c>true</c> if some characters were added to the buffer, <c>false</c> if
/// refilling is disabled (the chunk size isn't positive) or no characters were read.</returns>
/// <exception cref="ParserException">The enlarged buffer would exceed the buffer limit.</exception>
protected bool Feed() {
    if (m_chunkSize <= 0)
        return false;

    if (m_buffer == null) {
        // the very first chunk, the buffer is created on demand
        Debug.Assert(m_bufferOffset == 0);
        m_buffer = new char[m_chunkSize];
        m_bufferSize = Read(m_buffer, 0, m_chunkSize);
        return (m_bufferSize != 0);
    }

    var free = m_buffer.Length - m_bufferSize;

    if (free < m_chunkSize) {
        // Not enough room for a chunk: move the unconsumed data to the start
        // of a new buffer which has room for one more chunk.
        free += m_chunkSize;
        var used = m_bufferSize - m_bufferOffset;
        var size = used + free;

        if (size > m_bufferMax)
            throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax / 1024));

        var temp = new char[size];

        // read first, if there is no more input the current buffer is left untouched
        var read = Read(temp, used, m_chunkSize);
        if (read == 0)
            return false;

        Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

        m_bufferOffset = 0;
        m_bufferSize = used + read;
        m_buffer = temp;
    } else {
        var read = Read(m_buffer, m_bufferSize, m_chunkSize);
        if (read == 0)
            return false;

        // Advance by the number of characters actually read, a chunk may be
        // shorter than requested and the rest of the slots hold no input.
        m_bufferSize += read;
    }

    return true;
}
|
|
145
|
|
/// <summary>
/// Reads characters from the underlying source into the specified buffer.
/// </summary>
/// <param name="buffer">The buffer which receives the characters.</param>
/// <param name="offset">The index in <paramref name="buffer"/> at which to start storing characters.</param>
/// <param name="size">The number of characters requested.</param>
/// <returns>The number of characters read; this class treats <c>0</c> as the absence of further input.</returns>
protected abstract int Read(char[] buffer, int offset, int size);
|
173
|
147
|
176
|
/// <summary>
/// Creates a string from the characters of the most recently read token.
/// </summary>
/// <returns>The text of the token.</returns>
public string GetTokenValue() {
    var value = new String(m_buffer, m_tokenOffset, m_tokenLength);
    return value;
}
|
|
151
|
176
|
/// <summary>
/// Copies the characters of the most recently read token to the specified array.
/// </summary>
/// <param name="buffer">The destination array.</param>
/// <param name="offset">The index in <paramref name="buffer"/> at which the token is placed.</param>
public void CopyTokenTo(char[] buffer, int offset) {
    // Copy the token only; copying the whole internal buffer would also transfer
    // the characters before and after the token and may overflow the destination.
    Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
}
|
|
155
|
|
/// <summary>
/// Appends the characters of the most recently read token to the specified string builder.
/// </summary>
/// <param name="sb">The string builder which receives the token.</param>
public void CopyTokenTo(StringBuilder sb) {
    var start = m_tokenOffset;
    var count = m_tokenLength;

    sb.Append(m_buffer, start, count);
}
|
175
|
159
|
173
|
160 }
|
|
161 }
|
|
162
|