comparison Implab/Formats/TextScanner.cs @ 176:0c3c69fe225b ref20160224

rewritten the text scanner
author cin
date Tue, 22 Mar 2016 18:58:40 +0300
parents 96a89dcb4060
children a0ff6a0e9c44
comparison
equal deleted inserted replaced
175:96a89dcb4060 176:0c3c69fe225b
1 using System; 1 using System;
2 using Implab.Components; 2 using Implab.Components;
3 using Implab.Automaton.RegularExpressions; 3 using Implab.Automaton.RegularExpressions;
4 using System.Diagnostics; 4 using System.Diagnostics;
5 using Implab.Automaton; 5 using Implab.Automaton;
6 using System.IO;
7 using System.Text;
6 8
namespace Implab.Formats {
    /// <summary>
    /// Base class for text scanners which split a character stream into tokens
    /// using a table-driven deterministic finite automaton.
    /// </summary>
    /// <remarks>
    /// The scanner either works on a caller-supplied buffer or pulls the input
    /// chunk by chunk through <see cref="Read"/>, growing its internal buffer
    /// up to the configured limit while a token is being matched.
    /// </remarks>
    public abstract class TextScanner : Disposable {
        // Upper bound for the size of the internal buffer, in characters.
        readonly int m_bufferMax;
        // Number of characters requested from Read() per call; a value <= 0 disables feeding.
        readonly int m_chunkSize;

        char[] m_buffer;
        // Position of the first character which hasn't been consumed yet.
        int m_bufferOffset;
        // Position just past the last valid character in m_buffer.
        int m_bufferSize;
        // Start and length of the most recently read token inside m_buffer.
        int m_tokenOffset;
        int m_tokenLength;

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// which reads its input through <see cref="Read"/>.
        /// </summary>
        /// <param name="bufferMax">The maximum size of the internal buffer, in characters.</param>
        /// <param name="chunkSize">The number of characters requested from <see cref="Read"/> at once.</param>
        protected TextScanner(int bufferMax, int chunkSize) {
            // check the arguments, the fields are not assigned yet at this point
            Debug.Assert(chunkSize <= bufferMax);

            m_bufferMax = bufferMax;
            m_chunkSize = chunkSize;
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// which scans the specified buffer.
        /// </summary>
        /// <param name="buffer">The characters to scan, <c>null</c> leaves the scanner empty.</param>
        protected TextScanner(char[] buffer) {
            if (buffer != null) {
                m_buffer = buffer;
                m_bufferSize = buffer.Length;
            }
        }

        /// <summary>
        /// Greedily reads the next token: the automaton consumes characters until
        /// there is no transition for the next one or the input is exhausted.
        /// </summary>
        /// <returns><c>true</c> if a token was read, <c>false</c> if there are no more tokens in the stream.</returns>
        /// <param name="dfa">The transition map for the automaton, indexed by state and input symbol.</param>
        /// <param name="final">Flags of the final states of the automaton, indexed by state.</param>
        /// <param name="tags">Tags associated with the states, indexed by state.</param>
        /// <param name="state">The initial state for the automaton.</param>
        /// <param name="alphabet">Maps a character code to the input symbol of the automaton,
        /// characters outside of the map are treated as <c>DFAConst.UNCLASSIFIED_INPUT</c>.</param>
        /// <param name="tag">Receives the tags of the final state when a token is read, otherwise <c>null</c>.</param>
        /// <exception cref="ArgumentNullException">One of the automaton tables is <c>null</c>.</exception>
        /// <exception cref="ParserException">The input doesn't match any token.</exception>
        internal bool ReadToken<TTag>(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
            if (dfa == null)
                throw new ArgumentNullException("dfa");
            if (final == null)
                throw new ArgumentNullException("final");
            if (tags == null)
                throw new ArgumentNullException("tags");
            if (alphabet == null)
                throw new ArgumentNullException("alphabet");

            tag = null;
            m_tokenLength = 0;

            var maxSymbol = alphabet.Length - 1;

            // 'state' always holds the last reachable state, 'next' is the result
            // of the latest transition and may become UNREACHABLE_STATE.
            int next;

            do {
                // after the next chunk is read the offset in the buffer may change
                int pos = m_bufferOffset + m_tokenLength;
                next = state;

                while (pos < m_bufferSize) {
                    var ch = m_buffer[pos];

                    next = dfa[next, ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]];
                    if (next == DFAConst.UNREACHABLE_STATE)
                        break;

                    state = next;
                    pos++;
                }

                m_tokenLength = pos - m_bufferOffset;
            } while (next != DFAConst.UNREACHABLE_STATE && Feed());

            m_tokenOffset = m_bufferOffset;
            m_bufferOffset += m_tokenLength;

            if (final[state]) {
                tag = tags[state];
                return true;
            }

            if (m_bufferOffset == m_bufferSize) {
                if (m_tokenLength == 0) //EOF
                    return false;

                // the input has ended in the middle of a token
                throw new ParserException();
            }

            throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
        }

        /// <summary>
        /// Replaces the current buffer with the specified range of characters.
        /// </summary>
        /// <param name="buffer">The new buffer.</param>
        /// <param name="offset">The position of the first character to scan.</param>
        /// <param name="length">The number of characters to scan.</param>
        protected void Feed(char[] buffer, int offset, int length) {
            m_buffer = buffer;
            m_bufferOffset = offset;
            m_bufferSize = offset + length;
        }

        /// <summary>
        /// Reads the next chunk of the input into the buffer, the data which isn't
        /// consumed yet is preserved.
        /// </summary>
        /// <returns><c>true</c> if some data was read, <c>false</c> if feeding is
        /// disabled or <see cref="Read"/> returned no data.</returns>
        /// <exception cref="ParserException">The buffer would exceed its size limit.</exception>
        protected bool Feed() {
            if (m_chunkSize <= 0)
                return false;

            if (m_buffer == null) {
                // the very first chunk
                Debug.Assert(m_bufferOffset == 0);
                m_buffer = new char[m_chunkSize];
                m_bufferSize = Read(m_buffer, 0, m_chunkSize);
                return (m_bufferSize != 0);
            }

            var free = m_buffer.Length - m_bufferSize;

            if (free >= m_chunkSize) {
                // there is enough room after the existing data, append in place
                var appended = Read(m_buffer, m_bufferSize, m_chunkSize);
                if (appended == 0)
                    return false;

                m_bufferSize += appended;
                return true;
            }

            // not enough room: allocate a larger buffer and move the unconsumed data
            // to its beginning
            free += m_chunkSize;
            var used = m_bufferSize - m_bufferOffset;
            var size = used + free;

            if (size > m_bufferMax)
                throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax / 1024));

            var temp = new char[size];

            // read first, the current buffer stays intact if there is no more data
            var read = Read(temp, used, m_chunkSize);
            if (read == 0)
                return false;

            Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

            m_bufferOffset = 0;
            m_bufferSize = used + read;
            m_buffer = temp;

            return true;
        }

        /// <summary>
        /// Reads characters from the underlying source.
        /// </summary>
        /// <returns>The number of characters read; the scanner treats <c>0</c> as the end of the input.</returns>
        /// <param name="buffer">The buffer to fill.</param>
        /// <param name="offset">The position in the buffer to start writing at.</param>
        /// <param name="size">The maximum number of characters to read.</param>
        protected abstract int Read(char[] buffer, int offset, int size);

        /// <summary>
        /// Gets the text of the last read token.
        /// </summary>
        /// <returns>A new string with the characters of the token.</returns>
        public string GetTokenValue() {
            return new String(m_buffer, m_tokenOffset, m_tokenLength);
        }

        /// <summary>
        /// Copies the characters of the last read token to the specified array.
        /// </summary>
        /// <param name="buffer">The destination array.</param>
        /// <param name="offset">The position in the destination array to start writing at.</param>
        public void CopyTokenTo(char[] buffer, int offset) {
            // copy only the token, not the whole internal buffer
            Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
        }

        /// <summary>
        /// Appends the characters of the last read token to the specified string builder.
        /// </summary>
        /// <param name="sb">The string builder to append to.</param>
        public void CopyTokenTo(StringBuilder sb) {
            sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
        }

    }
}
53 149