annotate Implab/Automaton/RegularExpressions/RegularExpressionVisitor.cs @ 236:302ca905c19e v2

JsonReader optimizations
author cin
date Tue, 21 Nov 2017 14:57:58 +0300
parents d5c5db0335ee
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
1 using Implab;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
2 using System;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
3 using System.Collections.Generic;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
4 using System.Diagnostics;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
5 using System.Linq;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
6
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
7 namespace Implab.Automaton.RegularExpressions {
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
8 /// <summary>
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
9 /// Используется для построения ДКА по регулярному выражению, сначала обходит
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
10 /// регулярное выражение и вычисляет followpos, затем используется метод
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
11 /// <see cref="BuildDFA(IDFADefinition)"/> для построения автомата.
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
12 /// </summary>
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
13 public class RegularExpressionVisitor : IVisitor {
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
14 int m_idx;
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
15 Token m_root;
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
16 HashSet<int> m_firstpos;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
17 HashSet<int> m_lastpos;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
18
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
19 readonly Dictionary<int, HashSet<int>> m_followpos = new Dictionary<int, HashSet<int>>();
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
20 readonly Dictionary<int, int> m_indexes = new Dictionary<int, int>();
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
21 readonly HashSet<int> m_ends = new HashSet<int>();
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
22
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
23 readonly IDFATableBuilder m_builder;
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
24 readonly IAlphabetBuilder<HashSet<int>> m_states = new MapAlphabet<HashSet<int>>(
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
25 false,
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
26 new CustomEqualityComparer<HashSet<int>>(
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
27 (x, y) => x.SetEquals(y),
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
28 x => x.Sum(n => n.GetHashCode())
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
29 )
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
30 );
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
31
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
32 public RegularExpressionVisitor(IDFATableBuilder builder) {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
33 Safe.ArgumentNotNull(builder, "builder");
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
34
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
35 m_builder = builder;
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
36 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
37
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
38 HashSet<int> Followpos(int pos) {
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
39 HashSet<int> set;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
40 return m_followpos.TryGetValue(pos, out set) ? set : m_followpos[pos] = new HashSet<int>();
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
41 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
42
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
43 bool Nullable(object n) {
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
44 if (n is EmptyToken || n is StarToken)
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
45 return true;
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
46 var altToken = n as AltToken;
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
47 if (altToken != null)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
48 return Nullable(altToken.Left) || Nullable(altToken.Right);
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
49 var catToken = n as CatToken;
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
50 if (catToken != null)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
51 return Nullable(catToken.Left) && Nullable(catToken.Right);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
52 return false;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
53 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
54
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
55 protected int Index {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
56 get { return m_idx; }
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
57 }
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
58
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
59 public void Visit(AltToken token) {
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
60 if (m_root == null)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
61 m_root = token;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
62 var firtspos = new HashSet<int>();
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
63 var lastpos = new HashSet<int>();
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
64
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
65 token.Left.Accept(this);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
66 firtspos.UnionWith(m_firstpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
67 lastpos.UnionWith(m_lastpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
68
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
69 token.Right.Accept(this);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
70 firtspos.UnionWith(m_firstpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
71 lastpos.UnionWith(m_lastpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
72
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
73 m_firstpos = firtspos;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
74 m_lastpos = lastpos;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
75 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
76
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
77 public void Visit(StarToken token) {
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
78 if (m_root == null)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
79 m_root = token;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
80 token.Token.Accept(this);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
81
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
82 foreach (var i in m_lastpos)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
83 Followpos(i).UnionWith(m_firstpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
84 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
85
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
86 public void Visit(CatToken token) {
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
87 if (m_root == null)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
88 m_root = token;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
89
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
90 var firtspos = new HashSet<int>();
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
91 var lastpos = new HashSet<int>();
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
92 token.Left.Accept(this);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
93 firtspos.UnionWith(m_firstpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
94 var leftLastpos = m_lastpos;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
95
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
96 token.Right.Accept(this);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
97 lastpos.UnionWith(m_lastpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
98 var rightFirstpos = m_firstpos;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
99
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
100 if (Nullable(token.Left))
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
101 firtspos.UnionWith(rightFirstpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
102
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
103 if (Nullable(token.Right))
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
104 lastpos.UnionWith(leftLastpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
105
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
106 m_firstpos = firtspos;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
107 m_lastpos = lastpos;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
108
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
109 foreach (var i in leftLastpos)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
110 Followpos(i).UnionWith(rightFirstpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
111
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
112 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
113
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
114 public void Visit(EmptyToken token) {
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
115 if (m_root == null)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
116 m_root = token;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
117 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
118
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
119 public void Visit(SymbolToken token) {
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
120 if (m_root == null)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
121 m_root = token;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
122 m_idx++;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
123 m_indexes[m_idx] = token.Value;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
124 m_firstpos = new HashSet<int>(new[] { m_idx });
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
125 m_lastpos = new HashSet<int>(new[] { m_idx });
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
126 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
127
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
128 public virtual void Visit(EndToken token) {
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
129 if (m_root == null)
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
130 m_root = token;
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
131 m_idx++;
236
302ca905c19e JsonReader optimizations
cin
parents: 178
diff changeset
132 m_indexes[m_idx] = AutomatonConst.UnclassifiedInput;
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
133 m_firstpos = new HashSet<int>(new[] { m_idx });
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
134 m_lastpos = new HashSet<int>(new[] { m_idx });
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
135 Followpos(m_idx);
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
136 m_ends.Add(m_idx);
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
137 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
138
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
139 public void BuildDFA() {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
140 AddState(m_firstpos);
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
141 SetInitialState(m_firstpos);
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
142
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
143 if(IsFinal(m_firstpos))
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
144 MarkFinalState(m_firstpos);
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
145
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
146 var inputMax = m_indexes.Values.Max();
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
147 var queue = new Queue<HashSet<int>>();
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
148
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
149 queue.Enqueue(m_firstpos);
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
150
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
151 while (queue.Count > 0) {
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
152 var s1 = queue.Dequeue();
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
153
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
154 for (int a = 0; a <= inputMax; a++) {
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
155 var s2 = new HashSet<int>();
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
156 foreach (var p in s1) {
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
157 if (m_indexes[p] == a) {
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
158 s2.UnionWith(Followpos(p));
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
159 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
160 }
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
161 if (s2.Count > 0) {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
162 if (!HasState(s2)) {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
163 AddState(s2);
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
164 if (IsFinal(s2))
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
165 MarkFinalState(s2);
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
166
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
167 queue.Enqueue(s2);
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
168 }
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
169
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
170 DefineTransition(s1, s2, a);
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
171 }
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
172
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
173 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
174 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
175 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
176
178
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
177 protected bool HasState(HashSet<int> state) {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
178 return m_states.Contains(state);
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
179 }
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
180
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
181 protected void AddState(HashSet<int> state) {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
182 Debug.Assert(!HasState(state));
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
183
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
184 m_states.DefineSymbol(state);
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
185 }
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
186
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
187 protected int Translate(HashSet<int> state) {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
188 Debug.Assert(HasState(state));
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
189
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
190 return m_states.Translate(state);
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
191 }
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
192
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
193 protected virtual void SetInitialState(HashSet<int> state) {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
194 m_builder.SetInitialState(Translate(state));
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
195 }
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
196
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
197 protected virtual void MarkFinalState(HashSet<int> state) {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
198 m_builder.MarkFinalState(Translate(state));
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
199 }
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
200
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
201 protected virtual void DefineTransition(HashSet<int> s1, HashSet<int> s2, int ch) {
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
202
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
203 m_builder.Add(new AutomatonTransition(Translate(s1), Translate(s2), ch));
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
204 }
d5c5db0335ee working on JSON parser
cin
parents: 177
diff changeset
205
177
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
206 bool IsFinal(IEnumerable<int> state) {
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
207 Debug.Assert(state != null);
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
208 return state.Any(m_ends.Contains);
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
209 }
a0ff6a0e9c44 refactoring
cin
parents: 172
diff changeset
210
172
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
211 }
92d5278d1b10 Working on text scanner
cin
parents:
diff changeset
212 }