annotate Implab/Automaton/Scanner.cs @ 175:96a89dcb4060 ref20160224

sync
author cin
date Mon, 21 Mar 2016 18:41:45 +0300
parents 983df35b3ca1
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
174
cin
parents: 172
diff changeset
1 using Implab;
cin
parents: 172
diff changeset
2 using System;
cin
parents: 172
diff changeset
3 using System.Collections.Generic;
cin
parents: 172
diff changeset
4 using System.IO;
cin
parents: 172
diff changeset
5 using Implab.Components;
cin
parents: 172
diff changeset
6 using Implab.Automaton.RegularExpressions;
cin
parents: 172
diff changeset
7
cin
parents: 172
diff changeset
8 namespace Implab.Automaton {
cin
parents: 172
diff changeset
9 /// <summary>
cin
parents: 172
diff changeset
10 /// Базовый класс для разбора потока входных символов на токены.
cin
parents: 172
diff changeset
11 /// </summary>
cin
parents: 172
diff changeset
12 /// <remarks>
cin
parents: 172
diff changeset
13 /// Сканер имеет внутри буфер с символами входного текста, по которому перемещаются два
cin
parents: 172
diff changeset
14 /// указателя, начала и конца токена, при перемещении используется ДКА для определения
cin
parents: 172
diff changeset
15 /// конца токена и допустимости текущего символа.
cin
parents: 172
diff changeset
16 /// </remarks>
cin
parents: 172
diff changeset
17 public abstract class Scanner<TTag> : Disposable {
cin
parents: 172
diff changeset
/// <summary>
/// Describes a DFA the scanner can run: the state table, the map that translates
/// input characters to alphabet codes, and the index of the start state.
/// </summary>
protected struct ScannerConfig {
    /// <summary>State table; indexed by state number.</summary>
    public readonly DFAStateDescriptor<TTag>[] states;

    /// <summary>Translation map; indexed by input character, yields the code used to select a transition.</summary>
    public readonly int[] alphabet;

    /// <summary>Index in <see cref="states"/> of the state every token starts from.</summary>
    public readonly int initialState;

    /// <summary>
    /// Stores the supplied parts of the DFA description as is; no validation or copying is done here.
    /// </summary>
    /// <param name="states">State table.</param>
    /// <param name="alphabet">Character translation map.</param>
    /// <param name="initialState">Index of the start state.</param>
    public ScannerConfig(DFAStateDescriptor<TTag>[] states, int[] alphabet, int initialState) {
        this.states = states;
        this.alphabet = alphabet;
        this.initialState = initialState;
    }
}
cin
parents: 172
diff changeset
29
cin
parents: 172
diff changeset
// DFA configurations saved by Switch and brought back by Restore.
Stack<ScannerConfig> m_defs = new Stack<ScannerConfig>();

// DFA currently used to recognize tokens.
ScannerConfig m_config;

// State reached by the most recent ReadTokenInternal call; its tags are exposed through TokenTags.
protected DFAStateDescriptor<TTag> m_currentState;
// Alphabet code of the character at m_pointer, computed by Shift, Switch and Restore.
int m_previewCode;

// Length of the current token, in characters.
protected int m_tokenLen;
// Index in m_buffer at which the current token starts.
protected int m_tokenOffset;

// Input buffer; either the array passed to Feed or an internal one filled from m_reader.
protected char[] m_buffer;
// Number of valid characters in m_buffer.
protected int m_bufferSize;
// Index in m_buffer of the character being examined.
protected int m_pointer;

// Optional source of further input; null when scanning a caller supplied array only.
TextReader m_reader;
// Whether m_reader is disposed by the scanner when it is replaced or the scanner is disposed.
bool m_disposeReader;
// Number of characters requested from m_reader per read; also the initial buffer size.
int m_chunkSize = 1024; // 1k
// Upper bound for the size of a reallocated buffer.
int m_limit = 10 * 1024 * 1024; // 10Mb
cin
parents: 172
diff changeset
48
cin
parents: 172
diff changeset
/// <summary>
/// Creates a scanner driven by the specified DFA.
/// </summary>
/// <param name="config">DFA description. The state table must not be empty, the alphabet
/// must not be <c>null</c> and the initial state must be a valid index in the state table.</param>
protected Scanner(ScannerConfig config) {
    Safe.ArgumentNotEmpty(config.states, "config.states");
    Safe.ArgumentNotNull(config.alphabet, "config.alphabet");
    // Fail fast: an invalid start state would otherwise surface only as an
    // IndexOutOfRangeException when the first token is read.
    Safe.ArgumentInRange(config.initialState, 0, config.states.Length - 1, "config.initialState");

    m_config = config;
}
cin
parents: 172
diff changeset
55
cin
parents: 172
diff changeset
/// <summary>
/// Fills the buffer with input data, using the whole array.
/// </summary>
/// <param name="data">Data to process.</param>
/// <remarks>The data is not copied; the supplied array itself is used as the
/// input buffer.</remarks>
public void Feed(char[] data) {
    Safe.ArgumentNotNull(data, "data");

    var length = data.Length;
    Feed(data, length);
}
cin
parents: 172
diff changeset
67
cin
parents: 172
diff changeset
/// <summary>
/// Fills the read buffer with input data.
/// </summary>
/// <param name="data">Data to process.</param>
/// <param name="length">Number of characters of <paramref name="data"/> to process.</param>
/// <remarks>The data is not copied; the supplied array itself is used as the
/// input buffer. A reader attached by an earlier call is detached (and disposed
/// if the scanner owns it), so the array is the only source of input.</remarks>
public void Feed(char[] data, int length) {
    Safe.ArgumentNotNull(data, "data");
    Safe.ArgumentInRange(length, 0, data.Length, "length");
    AssertNotDisposed();

    // Detach a previously attached reader; otherwise ReadNextChunk would keep
    // pulling characters from it once the end of the array is reached.
    if (m_reader != null) {
        if (m_disposeReader)
            m_reader.Dispose();
        m_reader = null;
        m_disposeReader = false;
    }

    // Token state left from the previous input does not describe the new buffer.
    m_tokenOffset = 0;
    m_tokenLen = 0;

    m_pointer = -1;
    m_buffer = data;
    m_bufferSize = length;
    // Move to the first character and compute its preview code.
    Shift();
}
cin
parents: 172
diff changeset
85
cin
parents: 172
diff changeset
/// <summary>
/// Attaches a reader as the source of input; the data is read in chunks as the
/// scanner advances.
/// </summary>
/// <param name="reader">Reader to take the input from.</param>
/// <param name="dispose"><c>true</c> if the scanner should dispose the reader when it
/// is replaced or when the scanner itself is disposed.</param>
public void Feed(TextReader reader, bool dispose) {
    Safe.ArgumentNotNull(reader, "reader");
    AssertNotDisposed();

    // Release the previous reader if we own it, unless the very same instance is
    // being attached again - disposing it would break the new input.
    if (m_reader != null && m_disposeReader && !ReferenceEquals(m_reader, reader))
        m_reader.Dispose();

    m_reader = reader;
    m_disposeReader = dispose;

    // Token state left from the previous input does not describe the new buffer;
    // ReadNextChunk relies on it when placing the data it reads.
    m_tokenOffset = 0;
    m_tokenLen = 0;

    m_pointer = -1;
    m_buffer = new char[m_chunkSize];
    m_bufferSize = 0;
    // Reads the first chunk and computes the preview code of the first character.
    Shift();
}
cin
parents: 172
diff changeset
100
cin
parents: 172
diff changeset
/// <summary>
/// Gets the current token as a string.
/// </summary>
/// <returns>A new string made of <see cref="m_tokenLen"/> characters of
/// <see cref="m_buffer"/> starting at <see cref="m_tokenOffset"/>.</returns>
protected string GetTokenValue() {
    var start = m_tokenOffset;
    var count = m_tokenLen;
    return new String(m_buffer, start, count);
}
cin
parents: 172
diff changeset
108
cin
parents: 172
diff changeset
/// <summary>
/// Tags of the current token, as assigned in the regular expression.
/// </summary>
protected TTag[] TokenTags {
    get {
        // the tags are stored with the DFA state the scanner stopped in
        var tags = m_currentState.tags;
        return tags;
    }
}
cin
parents: 172
diff changeset
117
cin
parents: 172
diff changeset
/// <summary>
/// Indicates that the end of the data has been reached.
/// </summary>
public bool EOF {
    get {
        var hasData = m_pointer < m_bufferSize;
        return !hasData;
    }
}
cin
parents: 172
diff changeset
126
cin
parents: 172
diff changeset
/// <summary>
/// Reads the next token. Afterwards <see cref="m_tokenOffset"/> points to the start of the
/// token, <see cref="m_tokenLen"/> holds its length and <see cref="m_buffer"/> is the
/// character array that contains the token.
/// </summary>
/// <returns><c>false</c> if the end of the data has been reached and no token was read.</returns>
/// <exception cref="ParserException">The current character has no transition from a
/// non-final state, or the data ended while the DFA was in a non-final state.</exception>
protected bool ReadTokenInternal() {
    if (m_pointer >= m_bufferSize)
        return false;

    // every token starts from the initial state at the current position
    m_currentState = m_config.states[m_config.initialState];
    m_tokenOffset = m_pointer;
    m_tokenLen = 0;

    for (;;) {
        var target = m_currentState.transitions[m_previewCode];

        if (target == DFAConst.UNREACHABLE_STATE) {
            // the current character cannot continue the token: either the token is
            // complete, or the input is invalid
            if (!m_currentState.final)
                throw new ParserException(
                    String.Format(
                        "Unexpected symbol '{0}', at pos {1}",
                        m_buffer[m_pointer],
                        Position
                    )
                );
            return true;
        }

        // consume the character and move on
        m_currentState = m_config.states[target];
        m_tokenLen++;

        if (!Shift())
            break;
    }

    // the data is over, the token is valid only if the DFA stopped in a final state
    if (m_currentState.final)
        return true;

    throw new ParserException("Unexpected end of data");
}
cin
parents: 172
diff changeset
166
cin
parents: 172
diff changeset
167
cin
parents: 172
diff changeset
/// <summary>
/// Advances the pointer by one character, loading more data when the buffered
/// data is exhausted, and computes the alphabet code of the new current character.
/// </summary>
/// <returns><c>false</c> if there is no more data; the preview code is left unchanged then.</returns>
bool Shift() {
    m_pointer++;

    var exhausted = m_pointer >= m_bufferSize;
    if (exhausted && !ReadNextChunk())
        return false;

    // ReadNextChunk may replace the buffer and rebase the pointer, so both are read only now
    var current = m_buffer[m_pointer];
    m_previewCode = m_config.alphabet[current];

    return true;
}
cin
parents: 172
diff changeset
180
cin
parents: 172
diff changeset
/// <summary>
/// Reads the next chunk of characters from the attached reader and appends it to the buffer.
/// </summary>
/// <remarks>
/// When a whole chunk does not fit after the current position, a new buffer is allocated
/// that keeps only the characters of the token being read; the token offset, the pointer
/// and the buffer size are rebased to the new buffer.
/// </remarks>
/// <returns><c>false</c> if no reader is attached or the reader returned no data.</returns>
/// <exception cref="ParserException">The size required for the new buffer reaches the limit.</exception>
bool ReadNextChunk() {
    if (m_reader == null)
        return false;

    // extend the buffer if necessary
    if (m_pointer + m_chunkSize > m_buffer.Length) {
        // drop the already consumed head of the buffer, keep the current token only
        var size = m_tokenLen + m_chunkSize;
        if (size >= m_limit)
            throw new ParserException(String.Format("Input buffer {0} bytes limit exceeded", m_limit));
        var temp = new char[size];
        Array.Copy(m_buffer, m_tokenOffset, temp, 0, m_tokenLen);
        m_pointer -= m_tokenOffset;
        m_bufferSize -= m_tokenOffset;
        m_tokenOffset = 0;
        m_buffer = temp;
    }

    // Append right after the valid data. The end of the valid data is m_bufferSize; using
    // the token length here is only correct while the token state is in sync with the
    // buffer, which is not the case when a new input is fed after a previous one.
    var read = m_reader.Read(m_buffer, m_bufferSize, m_chunkSize);
    if (read == 0)
        return false;

    m_bufferSize += read;

    return true;
}
cin
parents: 172
diff changeset
207
cin
parents: 172
diff changeset
/// <summary>
/// One-based position of the scanner in the input buffer.
/// </summary>
/// <remarks>The value is derived from the buffer pointer, which ReadNextChunk rebases
/// when it reallocates the buffer.</remarks>
public int Position {
    get {
        var index = m_pointer;
        return index + 1;
    }
}
cin
parents: 172
diff changeset
216
cin
parents: 172
diff changeset
/// <summary>
/// Switches the scanner to the specified DFA, which makes it possible to implement
/// something similar to capturing groups. The DFA in use is saved and can be brought
/// back with <see cref="Restore"/>.
/// </summary>
/// <param name="config">Description of the DFA to switch to; its state table and
/// alphabet must not be <c>null</c>.</param>
protected void Switch(ScannerConfig config) {
    Safe.ArgumentNotNull(config.states, "config.states");
    // Checked before the stack is touched: a null alphabet would otherwise fail
    // below, after the current configuration has already been replaced.
    Safe.ArgumentNotNull(config.alphabet, "config.alphabet");

    m_defs.Push(m_config);
    m_config = config;

    // Re-classify the current character with the new alphabet. At the end of the
    // data (or before any input is fed) there is no current character to classify.
    if (m_buffer != null && m_pointer >= 0 && m_pointer < m_bufferSize)
        m_previewCode = m_config.alphabet[m_buffer[m_pointer]];
}
cin
parents: 172
diff changeset
230
cin
parents: 172
diff changeset
/// <summary>
/// Restores the DFA that was in use before the most recent <see cref="Switch"/>.
/// </summary>
/// <exception cref="InvalidOperationException">There is no saved DFA to restore.</exception>
protected void Restore() {
    if (m_defs.Count == 0)
        throw new InvalidOperationException("There is no saved scanner configuration to restore");
    m_config = m_defs.Pop();

    // Re-classify the current character with the restored alphabet. At the end of the
    // data (or before any input is fed) there is no current character to classify.
    if (m_buffer != null && m_pointer >= 0 && m_pointer < m_bufferSize)
        m_previewCode = m_config.alphabet[m_buffer[m_pointer]];
}
cin
parents: 172
diff changeset
241
cin
parents: 172
diff changeset
/// <summary>
/// Releases the reader, if the scanner owns it, and resets the buffer state.
/// </summary>
/// <param name="disposing"><c>true</c> when called from an explicit dispose; only then
/// the reader is disposed and the fields are reset.</param>
protected override void Dispose(bool disposing) {
    if (disposing) {
        var ownsReader = m_reader != null && m_disposeReader;
        if (ownsReader)
            m_reader.Dispose();

        // drop the buffer and bring all positions back to the initial values
        m_tokenOffset = 0;
        m_tokenLen = 0;
        m_pointer = 0;
        m_bufferSize = 0;
        m_buffer = null;
    }
    base.Dispose(disposing);
}
cin
parents: 172
diff changeset
254 }
cin
parents: 172
diff changeset
255 }