annotate Implab/Formats/TextScanner.cs @ 176:0c3c69fe225b ref20160224

rewritten the text scanner
author cin
date Tue, 22 Mar 2016 18:58:40 +0300
parents 96a89dcb4060
children a0ff6a0e9c44
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
173
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
1 using System;
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
2 using Implab.Components;
175
cin
parents: 174
diff changeset
3 using Implab.Automaton.RegularExpressions;
cin
parents: 174
diff changeset
4 using System.Diagnostics;
cin
parents: 174
diff changeset
5 using Implab.Automaton;
176
0c3c69fe225b rewritten the text scanner
cin
parents: 175
diff changeset
6 using System.IO;
0c3c69fe225b rewritten the text scanner
cin
parents: 175
diff changeset
7 using System.Text;
173
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
8
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
9 namespace Implab.Formats {
176
0c3c69fe225b rewritten the text scanner
cin
parents: 175
diff changeset
/// <summary>
/// Base class for buffered text scanners. The scanner keeps a character buffer,
/// runs a table-driven automaton over it to cut the input into tokens and, when
/// constructed with a chunk size, pulls more input through <see cref="Read"/>.
/// </summary>
public abstract class TextScanner : Disposable {
    // Upper bound for the length of the internal buffer (in characters).
    readonly int m_bufferMax;
    // Number of characters requested from Read() at a time; 0 disables reading.
    readonly int m_chunkSize;

    char[] m_buffer;
    // Index of the first character in m_buffer which isn't consumed yet.
    int m_bufferOffset;
    // Index right after the last valid character in m_buffer.
    int m_bufferSize;
    // Position and length of the most recently read token inside m_buffer.
    int m_tokenOffset;
    int m_tokenLength;

    /// <summary>
    /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
    /// which obtains its input in chunks through <see cref="Read"/>.
    /// </summary>
    /// <param name="bufferMax">The maximum length of the internal buffer, in characters.</param>
    /// <param name="chunkSize">The number of characters requested from <see cref="Read"/> at a time.</param>
    protected TextScanner(int bufferMax, int chunkSize) {
        // check the arguments, the fields aren't assigned yet at this point
        Debug.Assert(chunkSize <= bufferMax);

        m_bufferMax = bufferMax;
        m_chunkSize = chunkSize;
    }

    /// <summary>
    /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
    /// over a preloaded buffer. The chunk size stays zero, so <see cref="Feed()"/> never
    /// reads additional input for such a scanner.
    /// </summary>
    /// <param name="buffer">The characters to scan, <c>null</c> is treated as an empty input.</param>
    protected TextScanner(char[] buffer) {
        if (buffer != null) {
            m_buffer = buffer;
            m_bufferSize = buffer.Length;
        }
    }

    /// <summary>
    /// (hungry) Reads the next token: follows the automaton for as long as a transition
    /// exists for the next character, requesting more input when the buffer is exhausted.
    /// </summary>
    /// <returns><c>true</c>, if a token was read, <c>false</c> if there are no more tokens in the stream.</returns>
    /// <param name="dfa">The transition map for the automaton, indexed by state and input symbol.</param>
    /// <param name="final">Final states of the automaton, indexed by state.</param>
    /// <param name="tags">Tags, indexed by state.</param>
    /// <param name="state">The initial state for the automaton.</param>
    /// <param name="alphabet">Maps a character code to the input symbol of the automaton.</param>
    /// <param name="tag">Receives the tags of the state the token ended in, <c>null</c> when no token was read.</param>
    /// <exception cref="ArgumentNullException">One of the tables is <c>null</c>.</exception>
    /// <exception cref="ParserException">The input stops or continues in a way the automaton doesn't accept.</exception>
    internal bool ReadToken<TTag>(int[,] dfa, int[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
        if (dfa == null)
            throw new ArgumentNullException("dfa");
        if (final == null)
            throw new ArgumentNullException("final");
        if (tags == null)
            throw new ArgumentNullException("tags");
        if (alphabet == null)
            throw new ArgumentNullException("alphabet");

        // an out parameter must be assigned on every path, including the EOF one
        tag = null;
        m_tokenLength = 0;

        var maxSymbol = alphabet.Length - 1;

        // 'next' holds the result of the last transition while 'state' keeps the last
        // reachable state, it is the one which decides whether the token is accepted
        int next = state;

        do {
            // after the next chunk is read the offset in the buffer may change
            int pos = m_bufferOffset + m_tokenLength;

            while (pos < m_bufferSize) {
                var ch = m_buffer[pos];

                next = dfa[state, ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]];
                if (next == DFAConst.UNREACHABLE_STATE)
                    break;

                state = next;
                pos++;
            }

            m_tokenLength = pos - m_bufferOffset;
        } while (next != DFAConst.UNREACHABLE_STATE && Feed());

        m_tokenOffset = m_bufferOffset;
        m_bufferOffset += m_tokenLength;

        // NOTE(review): 'final' is declared as int[] but was used as a condition,
        // a non-zero entry is treated as "final" here - confirm with the table builder.
        if (final[state] != 0) {
            tag = tags[state];
            return true;
        }

        if (m_bufferOffset == m_bufferSize) {
            if (m_tokenLength == 0) //EOF
                return false;

            // the input ended in the middle of a token
            throw new ParserException();
        }

        throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
    }

    /// <summary>
    /// Replaces the current buffer with the specified range of the supplied array.
    /// </summary>
    /// <param name="buffer">The array which becomes the buffer of the scanner.</param>
    /// <param name="offset">The index of the first character to scan.</param>
    /// <param name="length">The number of characters available starting from <paramref name="offset"/>.</param>
    protected void Feed(char[] buffer, int offset, int length) {
        m_buffer = buffer;
        m_bufferOffset = offset;
        m_bufferSize = offset + length;
    }

    /// <summary>
    /// Reads the next chunk of the input into the buffer, the unconsumed part of the
    /// buffer is preserved.
    /// </summary>
    /// <returns><c>true</c> if some characters were added to the buffer, <c>false</c> if
    /// reading is disabled (the chunk size isn't positive) or <see cref="Read"/> returned nothing.</returns>
    /// <exception cref="ParserException">The buffer would have to grow beyond the configured limit.</exception>
    protected bool Feed() {
        if (m_chunkSize <= 0)
            return false;

        if (m_buffer == null) {
            // the very first read
            Debug.Assert(m_bufferOffset == 0);
            m_buffer = new char[m_chunkSize];
            m_bufferSize = Read(m_buffer, 0, m_chunkSize);
            return (m_bufferSize != 0);
        }

        var free = m_buffer.Length - m_bufferSize;

        if (free < m_chunkSize) {
            // not enough room for a chunk: move the unconsumed data to a new buffer
            free += m_chunkSize;
            var used = m_bufferSize - m_bufferOffset;
            var size = used + free;

            if (size > m_bufferMax)
                throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax / 1024));

            var temp = new char[size];

            // read first, if there is no more input the current buffer stays untouched
            var read = Read(temp, used, m_chunkSize);
            if (read == 0)
                return false;

            Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

            m_bufferOffset = 0;
            m_bufferSize = used + read;
            m_buffer = temp;
        } else {
            // the chunk fits into the tail of the current buffer
            var read = Read(m_buffer, m_bufferSize, m_chunkSize);
            if (read == 0)
                return false;

            m_bufferSize += read;
        }

        return true;
    }

    /// <summary>
    /// Reads characters from the underlying source into the specified buffer.
    /// </summary>
    /// <returns>The number of characters read, <see cref="Feed()"/> treats <c>0</c> as "no more input".</returns>
    /// <param name="buffer">The buffer which receives the characters.</param>
    /// <param name="offset">The index in <paramref name="buffer"/> to start writing at.</param>
    /// <param name="size">The number of characters requested.</param>
    protected abstract int Read(char[] buffer, int offset, int size);

    /// <summary>
    /// Gets the text of the most recently read token.
    /// </summary>
    /// <returns>A new string with the characters of the token.</returns>
    public string GetTokenValue() {
        return new String(m_buffer, m_tokenOffset, m_tokenLength);
    }

    /// <summary>
    /// Copies the characters of the most recently read token to the specified array.
    /// </summary>
    /// <param name="buffer">The destination array.</param>
    /// <param name="offset">The index in <paramref name="buffer"/> to start writing at.</param>
    public void CopyTokenTo(char[] buffer, int offset) {
        // copy the token only, not the whole internal buffer
        Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
    }

    /// <summary>
    /// Appends the characters of the most recently read token to the specified builder.
    /// </summary>
    /// <param name="sb">The builder which receives the token.</param>
    public void CopyTokenTo(StringBuilder sb) {
        sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
    }

}
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
148 }
ecfece82ca11 Working on text scanner
cin
parents:
diff changeset
149