annotate Implab/Formats/TextScanner.cs @ 183:4f82e0f161c3 ref20160224

fixed DFA optimization, JSON is fully functional
author cin
date Fri, 25 Mar 2016 02:49:02 +0300
parents 76e8f2ba12b8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
173
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
1 using System;
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
2 using Implab.Components;
175
cin
parents: 174
diff changeset
3 using System.Diagnostics;
cin
parents: 174
diff changeset
4 using Implab.Automaton;
176
0c3c69fe225b rewritten the text scanner
cin
parents: 175
diff changeset
5 using System.Text;
173
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
6
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
7 namespace Implab.Formats {
176
0c3c69fe225b rewritten the text scanner
cin
parents: 175
diff changeset
/// <summary>
/// Buffered character scanner which extracts tokens by running a table-driven
/// automaton over the buffered data. The data is either supplied up front as a
/// ready buffer or pulled in chunks through <see cref="Read"/>.
/// </summary>
public abstract class TextScanner : Disposable {
    // Upper limit for the size of the internal buffer, checked when the buffer is reallocated in Feed().
    readonly int m_bufferMax;
    // Number of characters requested from Read() at a time; a non-positive value disables Feed().
    readonly int m_chunkSize;

    // The current data buffer, null until the first chunk is read or a buffer is supplied.
    char[] m_buffer;
    // Index of the first character in m_buffer which hasn't been consumed yet.
    int m_bufferOffset;
    // Index one past the last valid character in m_buffer (an end position, not a count).
    int m_bufferSize;
    // Start of the last token read by ReadToken.
    int m_tokenOffset;
    // Length of the last token read by ReadToken.
    int m_tokenLength;

    /// <summary>
    /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
    /// which reads the data in chunks using <see cref="Read"/>.
    /// </summary>
    /// <param name="bufferMax">The maximum size the internal buffer is allowed to grow to.</param>
    /// <param name="chunkSize">The number of characters requested from <see cref="Read"/> at a time.</param>
    protected TextScanner(int bufferMax, int chunkSize) {
        // check the arguments: the fields aren't assigned yet at this point
        Debug.Assert(chunkSize <= bufferMax);

        m_bufferMax = bufferMax;
        m_chunkSize = chunkSize;
    }

    /// <summary>
    /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
    /// over the specified buffer. The chunk size is left zero, therefore
    /// <see cref="Feed()"/> will never request additional data.
    /// </summary>
    /// <param name="buffer">The data to scan, if <c>null</c> the scanner starts without any data.</param>
    protected TextScanner(char[] buffer) {
        if (buffer != null) {
            m_buffer = buffer;
            m_bufferSize = buffer.Length;
        }
    }

    /// <summary>
    /// (hungry) Reads the next token.
    /// </summary>
    /// <returns><c>true</c>, if the token was read, <c>false</c> if there are no more tokens in the stream.</returns>
    /// <param name="dfa">The transition map for the automaton, indexed by the state and the input symbol class.</param>
    /// <param name="final">Final states of the automaton, indexed by the state.</param>
    /// <param name="tags">Tags of the states, indexed by the state.</param>
    /// <param name="state">The initial state for the automaton.</param>
    /// <param name="alphabet">
    /// Maps a character code to the input symbol class, characters beyond the end
    /// of the map are treated as <c>AutomatonConst.UNCLASSIFIED_INPUT</c>.
    /// </param>
    /// <param name="tag">Receives the tags of the final state, <c>null</c> when no token was read.</param>
    /// <exception cref="ParserException">
    /// The automaton has stopped in a non-final state: either an unexpected symbol
    /// was met or the data has ended in the middle of the token.
    /// </exception>
    internal bool ReadToken<TTag>(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
        m_tokenLength = 0;
        tag = null;

        var maxSymbol = alphabet.Length - 1;
        int next;
        do {
            // after the next chunk is read the offset in the buffer may change
            int pos = m_bufferOffset + m_tokenLength;
            next = state;
            while (pos < m_bufferSize) {
                var ch = m_buffer[pos];

                next = dfa[next, ch > maxSymbol ? AutomatonConst.UNCLASSIFIED_INPUT : alphabet[ch]];

                // the current character can't continue the token, leave it in the buffer
                if (next == AutomatonConst.UNREACHABLE_STATE)
                    break;

                state = next;
                pos++;
            }
            m_tokenLength = pos - m_bufferOffset;
            // the buffer is exhausted but the token may continue in the next chunk
        } while (next != AutomatonConst.UNREACHABLE_STATE && Feed());

        m_tokenOffset = m_bufferOffset;
        m_bufferOffset += m_tokenLength;

        if (final[state]) {
            tag = tags[state];
            return true;
        }

        // all buffered data is consumed and no more data is available
        if (m_bufferOffset == m_bufferSize) {
            if (m_tokenLength == 0) //EOF
                return false;

            throw new ParserException("Unexpected end of data");
        }

        throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
    }

    /// <summary>
    /// Replaces the current buffer with the specified one, the unread data
    /// of the previous buffer is discarded.
    /// </summary>
    /// <param name="buffer">The buffer with the data.</param>
    /// <param name="offset">The index of the first character of the data in the buffer.</param>
    /// <param name="length">The number of characters of the data.</param>
    protected void Feed(char[] buffer, int offset, int length) {
        m_buffer = buffer;
        m_bufferOffset = offset;
        m_bufferSize = offset + length;
    }

    /// <summary>
    /// Reads the next chunk of the data using <see cref="Read"/> and appends it
    /// to the unread data, the buffer is reallocated when there is no room for the chunk.
    /// </summary>
    /// <returns><c>true</c> if some data was read, <c>false</c> if no data is available or the chunked reading is disabled.</returns>
    /// <exception cref="ParserException">The buffer can't be extended without exceeding the limit.</exception>
    protected bool Feed() {
        if (m_chunkSize <= 0)
            return false;

        if (m_buffer == null) {
            // the very first chunk
            Debug.Assert(m_bufferOffset == 0);
            m_buffer = new char[m_chunkSize];
            m_bufferSize = Read(m_buffer, 0, m_chunkSize);
            return (m_bufferSize != 0);
        }

        var free = m_buffer.Length - m_bufferSize;

        if (free >= m_chunkSize) {
            // there is enough room after the data, append the chunk in place
            var appended = Read(m_buffer, m_bufferSize, m_chunkSize);
            if (appended == 0)
                return false;

            // Read may return less than requested, advance only by the actual count
            m_bufferSize += appended;
            return true;
        }

        // allocate a new buffer which holds the unread data, the remaining free space and one more chunk
        free += m_chunkSize;
        var used = m_bufferSize - m_bufferOffset;
        var size = used + free;

        if (size > m_bufferMax)
            throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax / 1024));

        var temp = new char[size];

        // read first: when there is no more data the current buffer is left intact
        var read = Read(temp, used, m_chunkSize);
        if (read == 0)
            return false;

        // move the unread data to the beginning of the new buffer
        Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

        m_bufferOffset = 0;
        m_bufferSize = used + read;
        m_buffer = temp;

        return true;
    }

    /// <summary>
    /// Reads the data from the underlying source to the specified buffer.
    /// </summary>
    /// <returns>The number of characters actually read, zero is treated as the end of the data.</returns>
    /// <param name="buffer">The buffer to store the data.</param>
    /// <param name="offset">The index in the buffer to start writing at.</param>
    /// <param name="size">The maximum number of characters to read.</param>
    protected abstract int Read(char[] buffer, int offset, int size);

    /// <summary>
    /// Gets the text of the last token read.
    /// </summary>
    /// <returns>The token text, an empty string if the token is empty.</returns>
    public string GetTokenValue() {
        // the buffer may not exist yet when nothing has been read
        if (m_tokenLength == 0)
            return String.Empty;

        return new String(m_buffer, m_tokenOffset, m_tokenLength);
    }

    /// <summary>
    /// Copies the characters of the last token read to the specified array.
    /// </summary>
    /// <param name="buffer">The destination array.</param>
    /// <param name="offset">The index in the destination array to start writing at.</param>
    public void CopyTokenTo(char[] buffer, int offset) {
        Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
    }

    /// <summary>
    /// Appends the characters of the last token read to the specified string builder.
    /// </summary>
    /// <param name="sb">The destination string builder.</param>
    /// <exception cref="ArgumentNullException"><paramref name="sb"/> is <c>null</c>.</exception>
    public void CopyTokenTo(StringBuilder sb) {
        if (sb == null)
            throw new ArgumentNullException("sb");

        sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
    }

}
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
156 }
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
157