annotate Implab/Formats/TextScanner.cs @ 178:d5c5db0335ee ref20160224

working on JSON parser
author cin
date Wed, 23 Mar 2016 19:51:45 +0300
parents a0ff6a0e9c44
children c32688129f14
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
173
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
1 using System;
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
2 using Implab.Components;
175
cin
parents: 174
diff changeset
3 using System.Diagnostics;
cin
parents: 174
diff changeset
4 using Implab.Automaton;
176
0c3c69fe225b rewritten the text scanner
cin
parents: 175
diff changeset
5 using System.Text;
173
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
6
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
7 namespace Implab.Formats {
176
0c3c69fe225b rewritten the text scanner
cin
parents: 175
diff changeset
/// <summary>
/// Base class for buffered text scanners which split the input into tokens
/// using a table driven deterministic automaton (see <see cref="ReadToken{TTag}"/>).
/// Descendants supply the input either by implementing <see cref="Read"/> or by
/// handing over a ready buffer through the constructor or <see cref="Feed(char[],int,int)"/>.
/// </summary>
public abstract class TextScanner : Disposable {
    // The upper limit for the size of the internal buffer, checked when the buffer is grown.
    readonly int m_bufferMax;
    // The number of characters requested from Read() at once, 0 (or less) disables reading.
    readonly int m_chunkSize;

    // The internal buffer, null until the first chunk is read or a buffer is supplied.
    char[] m_buffer;
    // The index of the first character which isn't consumed yet.
    int m_bufferOffset;
    // The index right after the last valid character in the buffer.
    int m_bufferSize;
    // The location of the last token read by ReadToken().
    int m_tokenOffset;
    int m_tokenLength;

    /// <summary>
    /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
    /// which fills its buffer on demand by calling <see cref="Read"/>.
    /// </summary>
    /// <param name="bufferMax">The maximum size of the internal buffer.</param>
    /// <param name="chunkSize">The number of characters to request on each read,
    /// it is expected not to exceed <paramref name="bufferMax"/>.</param>
    protected TextScanner(int bufferMax, int chunkSize) {
        // check the arguments, the fields aren't assigned yet at this point
        Debug.Assert(chunkSize <= bufferMax);

        m_bufferMax = bufferMax;
        m_chunkSize = chunkSize;
    }

    /// <summary>
    /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
    /// which scans the specified buffer. The chunk size is left zero, therefore
    /// <see cref="Feed()"/> will never read additional data for this instance.
    /// </summary>
    /// <param name="buffer">The buffer with the text to scan, can be <c>null</c>.</param>
    protected TextScanner(char[] buffer) {
        if (buffer != null) {
            m_buffer = buffer;
            m_bufferSize = buffer.Length;
        }
    }

    /// <summary>
    /// (hungry) Reads the next token. The automaton consumes the input while a transition
    /// exists, additional chunks are read with <see cref="Feed()"/> when the buffer is exhausted.
    /// </summary>
    /// <returns><c>true</c> if a token was read, <c>false</c> if there are no more tokens in the stream.</returns>
    /// <param name="dfa">The transition map for the automaton, indexed by the state and the input symbol.</param>
    /// <param name="final">Final states of the automaton, indexed by the state.</param>
    /// <param name="tags">Tags of the states, indexed by the state.</param>
    /// <param name="state">The initial state for the automaton.</param>
    /// <param name="alphabet">The map from a character code to the input symbol of the automaton,
    /// characters outside of this map are translated to <c>AutomatonConst.UNCLASSIFIED_INPUT</c>.</param>
    /// <param name="tag">Receives the tags of the final state when the token is read, otherwise <c>null</c>.</param>
    /// <exception cref="ArgumentNullException">One of the tables is <c>null</c>.</exception>
    /// <exception cref="ParserException">The input doesn't match the automaton.</exception>
    internal bool ReadToken<TTag>(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
        if (dfa == null)
            throw new ArgumentNullException("dfa");
        if (final == null)
            throw new ArgumentNullException("final");
        if (tags == null)
            throw new ArgumentNullException("tags");
        if (alphabet == null)
            throw new ArgumentNullException("alphabet");

        // the out parameter must be assigned on every return path
        tag = null;
        m_tokenLength = 0;

        var maxSymbol = alphabet.Length - 1;

        // the result of the last transition, it's kept apart from the state to
        // preserve the last valid state when the automaton stops
        int next;

        do {
            // after the next chunk is read the offset in the buffer may change
            int pos = m_bufferOffset + m_tokenLength;
            next = state;

            while (pos < m_bufferSize) {
                var ch = m_buffer[pos];

                next = dfa[next, ch > maxSymbol ? AutomatonConst.UNCLASSIFIED_INPUT : alphabet[ch]];
                if (next == AutomatonConst.UNREACHABLE_STATE)
                    break;

                state = next;
                pos++;
            }

            m_tokenLength = pos - m_bufferOffset;
        } while (next != AutomatonConst.UNREACHABLE_STATE && Feed());

        m_tokenOffset = m_bufferOffset;
        m_bufferOffset += m_tokenLength;

        if (final[state]) {
            tag = tags[state];
            return true;
        }

        if (m_bufferOffset == m_bufferSize) {
            if (m_tokenLength == 0) //EOF
                return false;

            // the input has ended in the middle of the token
            throw new ParserException();
        }

        throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
    }

    /// <summary>
    /// Replaces the internal buffer with the specified one.
    /// </summary>
    /// <param name="buffer">The new buffer.</param>
    /// <param name="offset">The index of the first character to scan.</param>
    /// <param name="length">The number of characters to scan starting from <paramref name="offset"/>.</param>
    protected void Feed(char[] buffer, int offset, int length) {
        m_buffer = buffer;
        m_bufferOffset = offset;
        m_bufferSize = offset + length;
    }

    /// <summary>
    /// Reads the next chunk with <see cref="Read"/> and appends it to the unconsumed data.
    /// When the buffer has no room for the whole chunk a new buffer is allocated and
    /// the unconsumed data is moved to its beginning.
    /// </summary>
    /// <returns><c>true</c> if some data was read, <c>false</c> if the reading is disabled
    /// (the chunk size isn't positive) or <see cref="Read"/> has returned no data.</returns>
    /// <exception cref="ParserException">The buffer needs to grow beyond the limit.</exception>
    protected bool Feed() {
        if (m_chunkSize <= 0)
            return false;

        if (m_buffer == null) {
            // the very first chunk
            Debug.Assert(m_bufferOffset == 0);
            m_buffer = new char[m_chunkSize];
            m_bufferSize = Read(m_buffer, 0, m_chunkSize);
            return (m_bufferSize != 0);
        }

        var free = m_buffer.Length - m_bufferSize;

        if (free >= m_chunkSize) {
            // there is enough room, append the chunk in place
            var appended = Read(m_buffer, m_bufferSize, m_chunkSize);
            if (appended == 0)
                return false;

            m_bufferSize += appended;
            return true;
        }

        free += m_chunkSize;
        var used = m_bufferSize - m_bufferOffset;
        var size = used + free;

        if (size > m_bufferMax)
            throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax/1024));

        var temp = new char[size];

        // read first, the current buffer stays untouched if there is no more data
        var read = Read(temp, used, m_chunkSize);
        if (read == 0)
            return false;

        Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

        m_bufferOffset = 0;
        m_bufferSize = used + read;
        m_buffer = temp;

        return true;
    }

    /// <summary>
    /// Reads the next portion of the input. <see cref="Feed()"/> treats the zero result as the end of the input.
    /// </summary>
    /// <returns>The number of characters actually read.</returns>
    /// <param name="buffer">The buffer to store the data.</param>
    /// <param name="offset">The index in the buffer where to put the first character.</param>
    /// <param name="size">The maximum number of characters to read.</param>
    protected abstract int Read(char[] buffer, int offset, int size);

    /// <summary>
    /// Creates a string from the last token read.
    /// </summary>
    /// <returns>The text of the token.</returns>
    public string GetTokenValue() {
        return new String(m_buffer, m_tokenOffset, m_tokenLength);
    }

    /// <summary>
    /// Copies the characters of the last token read to the specified array.
    /// </summary>
    /// <param name="buffer">The destination array.</param>
    /// <param name="offset">The index in the destination array where to put the first character.</param>
    public void CopyTokenTo(char[] buffer, int offset) {
        // copy the token only, not the whole internal buffer
        Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
    }

    /// <summary>
    /// Appends the characters of the last token read to the specified string builder.
    /// </summary>
    /// <param name="sb">The destination string builder.</param>
    public void CopyTokenTo(StringBuilder sb) {
        sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
    }

}
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
149 }
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
150