changeset 172:92d5278d1b10 ref20160224

Working on text scanner
author cin
date Mon, 14 Mar 2016 01:19:38 +0300 (2016-03-13)
parents 0f70905b4652
children ecfece82ca11
files Implab/Automaton/DFAStateDescriptor.cs Implab/Automaton/DFATable.cs Implab/Automaton/IndexedAlphabetBase.cs Implab/Automaton/MapAlphabet.cs Implab/Automaton/RegularExpressions/DFAStateDescriptorT.cs Implab/Automaton/RegularExpressions/Grammar.cs Implab/Automaton/RegularExpressions/ITaggedDFABuilder.cs Implab/Automaton/RegularExpressions/RegularDFA.cs Implab/Automaton/RegularExpressions/RegularDFABuilder.cs Implab/Automaton/RegularExpressions/RegularDFADefinition.cs Implab/Automaton/RegularExpressions/RegularExpressionVisitor.cs Implab/Automaton/Scanner.cs Implab/Formats/ByteAlphabet.cs Implab/Formats/CharAlphabet.cs Implab/Formats/JSON/JSONGrammar.cs Implab/Formats/JSON/JSONParser.cs Implab/Formats/RegularCharDFADefinition.cs Implab/Implab.csproj
diffstat 18 files changed, 376 insertions(+), 339 deletions(-) [+]
line wrap: on
line diff
--- a/Implab/Automaton/DFAStateDescriptor.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Automaton/DFAStateDescriptor.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -1,18 +1,18 @@
 namespace Implab.Automaton {
-    public struct DFAStateDescriptior {
+    public struct DFAStateDescriptor {
         public readonly bool final;
         public readonly int[] transitions;
 
 
-        public DFAStateDescriptior(int[] transitions, bool final) {
+        public DFAStateDescriptor(int[] transitions, bool final) {
             this.transitions = transitions;
             this.final = final;
         }
 
-        public DFAStateDescriptior(int[] transitions) : this(transitions, false) {
+        public DFAStateDescriptor(int[] transitions) : this(transitions, false) {
         }
 
-        public DFAStateDescriptior(int size, bool final) {
+        public DFAStateDescriptor(int size, bool final) {
             Safe.ArgumentInRange(size, 0, int.MaxValue, "size");
 
             this.final = final;
--- a/Implab/Automaton/DFATable.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Automaton/DFATable.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -100,6 +100,20 @@
             return GetEnumerator();
         }
 
+        public DFAStateDescriptor[] CreateTransitionTable() {
+            var table = new DFAStateDescriptor[StateCount];
+
+            foreach (var t in this) {
+                if (table[t.s1].transitions == null)
+                    table[t.s1] = new DFAStateDescriptor(AlphabetSize, IsFinalState(t.s1));
+                if (table[t.s2].transitions == null)
+                    table[t.s2] = new DFAStateDescriptor(AlphabetSize, IsFinalState(t.s2));
+                table[t.s1].transitions[t.edge] = t.s2;
+            }
+
+            return table;
+        }
+
         /// <summary>Формирует множества конечных состояний перед началом работы алгоритма минимизации.</summary>
         /// <remarks>
         /// В процессе построения минимального автомата требуется разделить множество состояний,
--- a/Implab/Automaton/IndexedAlphabetBase.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Automaton/IndexedAlphabetBase.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -71,8 +71,16 @@
             return true;
         }
 
+        public IEnumerable<T> GetSymbols(int cls) {
+            for (var i = 0; i < m_map.Length; i++)
+                if (m_map[i] == cls)
+                    yield return GetSymbolByIndex(i);
+        }
+
         public abstract int GetSymbolIndex(T symbol);
 
+        public abstract T GetSymbolByIndex(int index);
+
         public abstract IEnumerable<T> InputSymbols { get; }
 
         /// <summary>
--- a/Implab/Automaton/MapAlphabet.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Automaton/MapAlphabet.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -38,7 +38,6 @@
             Safe.ArgumentNotNull(symbols, "symbols");
 
             m_nextCls = Math.Max(cls + 1, m_nextCls);
-            symbols = symbols.Distinct();
 
             foreach (var symbol in symbols)
                 m_map[symbol] = cls;
@@ -68,6 +67,10 @@
             return m_supportUnclassified || m_map.ContainsKey(symbol);
         }
 
+
+        public IEnumerable<T> GetSymbols(int cls) {
+            return m_map.Where(p => p.Value == cls).Select(p => p.Key);
+        }
         #endregion
     }
 }
--- a/Implab/Automaton/RegularExpressions/DFAStateDescriptorT.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Automaton/RegularExpressions/DFAStateDescriptorT.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -1,14 +1,14 @@
 using System;
 
 namespace Implab.Automaton.RegularExpressions {
-    public struct DFAStateDescriptorT<T> {
+    public struct DFAStateDescriptor<T> {
         public readonly bool final;
 
         public readonly int[] transitions;
 
         public readonly T[] tags;
 
-        public DFAStateDescriptorT(int size, bool final, T[] tags) {
+        public DFAStateDescriptor(int size, bool final, T[] tags) {
             Safe.ArgumentAssert(size >= 0, "size");
             this.final = final;
             this.tags = tags;
--- a/Implab/Automaton/RegularExpressions/Grammar.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Automaton/RegularExpressions/Grammar.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -66,23 +66,21 @@
             return Token<TTag>.New( Enumerable.Range(0, AlphabetBuilder.Count).Except(TranslateOrDie(symbols)).ToArray() );
         }
 
-        protected void BuildDFA(Token<TTag> lang, IDFATableBuilder<TTag> dfaTable, IAlphabetBuilder<TSymbol> dfaAlphabet) {
-            Safe.ArgumentNotNull(lang, "lang");
-            Safe.ArgumentNotNull(dfaAlphabet, "dfaAlphabet");
-
-            var dfa = new RegularDFADefinition<TSymbol, TTag>(AlphabetBuilder);
+        protected abstract IAlphabetBuilder<TSymbol> CreateAlphabet();
 
-            var builder = new RegularDFABuilder<TTag>();
+        protected RegularDFA<TSymbol, TTag> BuildDFA(Token<TTag> regexp) {
+            
+            var dfa = new RegularDFA<TSymbol, TTag>(AlphabetBuilder);
 
-            lang.Accept( builder );
+            var visitor = new RegularExpressionVisitor<TTag>();
+            regexp.Accept( visitor );
 
-            builder.BuildDFA(dfa);
+            visitor.BuildDFA(dfa);
 
             if (dfa.IsFinalState(dfa.InitialState))
                 throw new ApplicationException("The specified language contains empty token");
 
-            dfa.Optimize(dfaTable, dfaAlphabet);
-
+            return dfa.Optimize(CreateAlphabet());
         }
 
     }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Implab/Automaton/RegularExpressions/ITaggedDFABuilder.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -0,0 +1,8 @@
+using System;
+
+namespace Implab.Automaton.RegularExpressions {
+    public interface ITaggedDFABuilder<TTag> : IDFATableBuilder {
+        void SetStateTag(int s, TTag[] tags);
+    }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Implab/Automaton/RegularExpressions/RegularDFA.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -0,0 +1,89 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Implab.Automaton.RegularExpressions {
+    public class RegularDFA<TInput, TTag> : DFATable, ITaggedDFABuilder<TTag> {
+
+        readonly Dictionary<int,TTag[]> m_tags = new Dictionary<int, TTag[]>();
+        readonly IAlphabet<TInput> m_alphabet;
+
+        public RegularDFA(IAlphabet<TInput> alphabet) {
+            Safe.ArgumentNotNull(alphabet, "aplhabet");
+
+            m_alphabet = alphabet;
+        }
+
+
+        public IAlphabet<TInput> InputAlphabet {
+            get {
+                return m_alphabet;
+            }
+        }
+
+        public void MarkFinalState(int s, TTag[] tags) {
+            MarkFinalState(s);
+            SetStateTag(s, tags);
+        }
+
+        public void SetStateTag(int s, TTag[] tags) {
+            Safe.ArgumentNotNull(tags, "tags");
+            m_tags[s] = tags;
+        }
+
+        public TTag[] GetStateTag(int s) {
+            TTag[] tags;
+            return m_tags.TryGetValue(s, out tags) ? tags : new TTag[0];
+        }
+
+        public new DFAStateDescriptor<TTag>[] CreateTransitionTable() {
+            var table = new DFAStateDescriptor<TTag>[StateCount];
+
+            foreach (var t in this) {
+                if (table[t.s1].transitions == null)
+                    table[t.s1] = new DFAStateDescriptor<TTag>(AlphabetSize, IsFinalState(t.s1), GetStateTag(t.s1));
+                if (table[t.s2].transitions == null)
+                    table[t.s2] = new DFAStateDescriptor<TTag>(AlphabetSize, IsFinalState(t.s2), GetStateTag(t.s2));
+                table[t.s1].transitions[t.edge] = t.s2;
+            }
+
+            return table;
+        }
+
+        /// <summary>
+        /// Optimize the specified alphabet.
+        /// </summary>
+        /// <param name="alphabet">Пустой алфавит, который будет зполнен в процессе оптимизации.</param>
+        public RegularDFA<TInput,TTag> Optimize(IAlphabetBuilder<TInput> alphabet) {
+            Safe.ArgumentNotNull(alphabet, "alphabet");
+
+            var dfa = new RegularDFA<TInput, TTag>(alphabet);
+
+            var states = new DummyAlphabet(StateCount);
+            var alphaMap = new Dictionary<int,int>();
+            var stateMap = new Dictionary<int,int>();
+
+            Optimize(dfa, alphaMap, stateMap); 
+
+            // mark tags in the new DFA
+            foreach (var g in m_tags.Where(x => x.Key < StateCount).GroupBy(x => stateMap[x.Key], x => x.Value ))
+                dfa.SetStateTag(g.Key, g.SelectMany(x => x).ToArray());
+
+            // make the alphabet for the new DFA
+            foreach (var pair in alphaMap)
+                alphabet.DefineClass(m_alphabet.GetSymbols(pair.Key), pair.Value);
+            
+            return dfa;
+        }
+
+        protected override IEnumerable<HashSet<int>> GroupFinalStates() {
+            var arrayComparer = new CustomEqualityComparer<TTag[]>(
+                (x,y) => x.Length == y.Length && x.All(it => y.Contains(it)),
+                x => x.Sum(it => x.GetHashCode())
+            );
+            return FinalStates.GroupBy(x => m_tags[x], arrayComparer).Select(g => new HashSet<int>(g));
+        }
+
+    }
+}
+
--- a/Implab/Automaton/RegularExpressions/RegularDFABuilder.cs	Thu Mar 10 01:19:33 2016 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,180 +0,0 @@
-using Implab;
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.Linq;
-
-namespace Implab.Automaton.RegularExpressions {
-    /// <summary>
-    /// Используется для построения ДКА по регулярному выражению, сначала обходит
-    /// регулярное выражение и вычисляет followpos, затем используется метод
-    /// <see cref="BuildDFA(IDFADefinition)"/> для построения автомата.
-    /// </summary>
-    public class RegularDFABuilder<TTag> : IVisitor<TTag> {
-        int m_idx;
-        Token<TTag> m_root;
-        HashSet<int> m_firstpos;
-        HashSet<int> m_lastpos;
-
-        readonly Dictionary<int, HashSet<int>> m_followpos = new Dictionary<int, HashSet<int>>();
-        readonly Dictionary<int, int> m_indexes = new Dictionary<int, int>();
-        readonly Dictionary<int, TTag> m_ends = new Dictionary<int, TTag>();
-
-        public Dictionary<int, HashSet<int>> FollowposMap {
-            get { return m_followpos; }
-        }
-
-        public HashSet<int> Followpos(int pos) {
-            HashSet<int> set;
-            return m_followpos.TryGetValue(pos, out set) ? set : m_followpos[pos] = new HashSet<int>();
-        }
-
-        bool Nullable(object n) {
-            if (n is EmptyToken<TTag> || n is StarToken<TTag>)
-                return true;
-            var altToken = n as AltToken<TTag>;
-            if (altToken != null)
-                return Nullable(altToken.Left) || Nullable(altToken.Right);
-            var catToken = n as CatToken<TTag>;
-            if (catToken != null)
-                return Nullable(catToken.Left) && Nullable(catToken.Right);
-            return false;
-        }
-
-
-        public void Visit(AltToken<TTag> token) {
-            if (m_root == null)
-                m_root = token;
-            var firtspos = new HashSet<int>();
-            var lastpos = new HashSet<int>();
-
-            token.Left.Accept(this);
-            firtspos.UnionWith(m_firstpos);
-            lastpos.UnionWith(m_lastpos);
-
-            token.Right.Accept(this);
-            firtspos.UnionWith(m_firstpos);
-            lastpos.UnionWith(m_lastpos);
-
-            m_firstpos = firtspos;
-            m_lastpos = lastpos;
-        }
-
-        public void Visit(StarToken<TTag> token) {
-            if (m_root == null)
-                m_root = token;
-            token.Token.Accept(this);
-
-            foreach (var i in m_lastpos)
-                Followpos(i).UnionWith(m_firstpos);
-        }
-
-        public void Visit(CatToken<TTag> token) {
-            if (m_root == null)
-                m_root = token;
-
-            var firtspos = new HashSet<int>();
-            var lastpos = new HashSet<int>();
-            token.Left.Accept(this);
-            firtspos.UnionWith(m_firstpos);
-            var leftLastpos = m_lastpos;
-
-            token.Right.Accept(this);
-            lastpos.UnionWith(m_lastpos);
-            var rightFirstpos = m_firstpos;
-
-            if (Nullable(token.Left))
-                firtspos.UnionWith(rightFirstpos);
-
-            if (Nullable(token.Right))
-                lastpos.UnionWith(leftLastpos);
-
-            m_firstpos = firtspos;
-            m_lastpos = lastpos;
-
-            foreach (var i in leftLastpos)
-                Followpos(i).UnionWith(rightFirstpos);
-
-        }
-
-        public void Visit(EmptyToken<TTag> token) {
-            if (m_root == null)
-                m_root = token;
-        }
-
-        public void Visit(SymbolToken<TTag> token) {
-            if (m_root == null)
-                m_root = token;
-            m_idx++;
-            m_indexes[m_idx] = token.Value;
-            m_firstpos = new HashSet<int>(new[] { m_idx });
-            m_lastpos = new HashSet<int>(new[] { m_idx });
-        }
-
-        public void Visit(EndToken<TTag> token) {
-            if (m_root == null)
-                m_root = token;
-            m_idx++;
-            m_indexes[m_idx] = DFAConst.UNCLASSIFIED_INPUT;
-            m_firstpos = new HashSet<int>(new[] { m_idx });
-            m_lastpos = new HashSet<int>(new[] { m_idx });
-            Followpos(m_idx);
-            m_ends.Add(m_idx, token.Tag);
-        }
-
-        public void BuildDFA(IDFATableBuilder dfa) {
-            Safe.ArgumentNotNull(dfa,"dfa");
-
-            var states = new MapAlphabet<HashSet<int>>(new CustomEqualityComparer<HashSet<int>>(
-                             (x, y) => x.SetEquals(y),
-                             x => x.Sum(n => n.GetHashCode())
-                         ));
-
-            var initialState = states.DefineSymbol(m_firstpos);
-            dfa.SetInitialState(initialState);
-
-            var tags = GetStateTags(m_firstpos);
-            if (tags != null && tags.Length > 0)
-                dfa.MarkFinalState(initialState, tags);
-
-            var inputMax = m_indexes.Values.Max();
-            var queue = new Queue<HashSet<int>>();
-
-            queue.Enqueue(m_firstpos);
-
-            while (queue.Count > 0) {
-                var state = queue.Dequeue();
-                var s1 = states.Translate(state);
-                Debug.Assert(s1 != DFAConst.UNCLASSIFIED_INPUT);
-
-                for (int a = 0; a <= inputMax; a++) {
-                    var next = new HashSet<int>();
-                    foreach (var p in state) {
-                        if (m_indexes[p] == a) {
-                            next.UnionWith(Followpos(p));
-                        }
-                    }
-                    if (next.Count > 0) {
-                        int s2 = states.Translate(next);
-                        if (s2 == DFAConst.UNCLASSIFIED_INPUT) {
-                            s2 = states.DefineSymbol(next);
-
-                            tags = GetStateTags(next);
-                            if (tags != null && tags.Length > 0)
-                                dfa.MarkFinalState(s2, tags);
-                            
-                            queue.Enqueue(next);
-                        }
-                        dfa.Add(new AutomatonTransition(s1, s2, a));
-                    }
-                }
-            }
-        }
-
-        TTag[] GetStateTags(IEnumerable<int> state) {
-            Debug.Assert(state != null);
-            return state.Where(m_ends.ContainsKey).Select(pos => m_ends[pos]).ToArray();
-        }
-
-    }
-}
--- a/Implab/Automaton/RegularExpressions/RegularDFADefinition.cs	Thu Mar 10 01:19:33 2016 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-using System;
-using System.Collections.Generic;
-using System.Linq;
-
-namespace Implab.Automaton.RegularExpressions {
-    public class RegularDFADefinition<TInput, TTag> : DFATable {
-
-        readonly Dictionary<int,TTag[]> m_tags = new Dictionary<int, TTag[]>();
-        readonly IAlphabet<TInput> m_alphabet;
-
-        public RegularDFADefinition(IAlphabet<TInput> alphabet) {
-            Safe.ArgumentNotNull(alphabet, "aplhabet");
-
-            m_alphabet = alphabet;
-        }
-
-
-        public IAlphabet<TInput> InputAlphabet {
-            get {
-                return m_alphabet;
-            }
-        }
-
-        public void MarkFinalState(int s, TTag[] tags) {
-            MarkFinalState(s);
-            SetStateTag(s, tags);
-        }
-
-        public void SetStateTag(int s, TTag[] tags) {
-            Safe.ArgumentNotNull(tags, "tags");
-            m_tags[s] = tags;
-        }
-
-        public TTag[] GetStateTag(int s) {
-            TTag[] tags;
-            return m_tags.TryGetValue(s, out tags) ? tags : new TTag[0];
-        }
-
-        /// <summary>
-        /// Optimize the specified alphabet.
-        /// </summary>
-        /// <param name="alphabet">Пустой алфавит, который будет зполнен в процессе оптимизации.</param>
-        public RegularDFADefinition<TInput,TTag> Optimize(IAlphabetBuilder<TInput> alphabet) {
-            Safe.ArgumentNotNull(alphabet, "alphabet");
-
-            var dfaTable = new RegularDFADefinition<TInput, TTag>(alphabet);
-
-            var states = new DummyAlphabet(StateCount);
-            var alphaMap = new Dictionary<int,int>();
-            var stateMap = new Dictionary<int,int>();
-            Optimize(dfaTable, alphaMap, stateMap); 
-
-            foreach (var g in m_tags.Where(x => x.Key < StateCount).GroupBy(x => stateMap[x.Key], x => x.Value ))
-                dfaTable.SetStateTag(g.Key, g.SelectMany(x => x).ToArray());
-
-            return dfaTable;
-        }
-
-        protected override IEnumerable<HashSet<int>> GroupFinalStates() {
-            var arrayComparer = new CustomEqualityComparer<TTag[]>(
-                (x,y) => x.Length == y.Length && x.All(it => y.Contains(it)),
-                x => x.Sum(it => x.GetHashCode())
-            );
-            return FinalStates.GroupBy(x => m_tags[x], arrayComparer).Select(g => new HashSet<int>(g));
-        }
-
-    }
-}
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Implab/Automaton/RegularExpressions/RegularExpressionVisitor.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -0,0 +1,184 @@
+using Implab;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+
+namespace Implab.Automaton.RegularExpressions {
+    /// <summary>
+    /// Используется для построения ДКА по регулярному выражению, сначала обходит
+    /// регулярное выражение и вычисляет followpos, затем используется метод
+    /// <see cref="BuildDFA(IDFADefinition)"/> для построения автомата.
+    /// </summary>
+    public class RegularExpressionVisitor<TTag> : IVisitor<TTag> {
+        int m_idx;
+        Token<TTag> m_root;
+        HashSet<int> m_firstpos;
+        HashSet<int> m_lastpos;
+
+        readonly Dictionary<int, HashSet<int>> m_followpos = new Dictionary<int, HashSet<int>>();
+        readonly Dictionary<int, int> m_indexes = new Dictionary<int, int>();
+        readonly Dictionary<int, TTag> m_ends = new Dictionary<int, TTag>();
+
+        public Dictionary<int, HashSet<int>> FollowposMap {
+            get { return m_followpos; }
+        }
+
+        public HashSet<int> Followpos(int pos) {
+            HashSet<int> set;
+            return m_followpos.TryGetValue(pos, out set) ? set : m_followpos[pos] = new HashSet<int>();
+        }
+
+        bool Nullable(object n) {
+            if (n is EmptyToken<TTag> || n is StarToken<TTag>)
+                return true;
+            var altToken = n as AltToken<TTag>;
+            if (altToken != null)
+                return Nullable(altToken.Left) || Nullable(altToken.Right);
+            var catToken = n as CatToken<TTag>;
+            if (catToken != null)
+                return Nullable(catToken.Left) && Nullable(catToken.Right);
+            return false;
+        }
+
+
+        public void Visit(AltToken<TTag> token) {
+            if (m_root == null)
+                m_root = token;
+            var firtspos = new HashSet<int>();
+            var lastpos = new HashSet<int>();
+
+            token.Left.Accept(this);
+            firtspos.UnionWith(m_firstpos);
+            lastpos.UnionWith(m_lastpos);
+
+            token.Right.Accept(this);
+            firtspos.UnionWith(m_firstpos);
+            lastpos.UnionWith(m_lastpos);
+
+            m_firstpos = firtspos;
+            m_lastpos = lastpos;
+        }
+
+        public void Visit(StarToken<TTag> token) {
+            if (m_root == null)
+                m_root = token;
+            token.Token.Accept(this);
+
+            foreach (var i in m_lastpos)
+                Followpos(i).UnionWith(m_firstpos);
+        }
+
+        public void Visit(CatToken<TTag> token) {
+            if (m_root == null)
+                m_root = token;
+
+            var firtspos = new HashSet<int>();
+            var lastpos = new HashSet<int>();
+            token.Left.Accept(this);
+            firtspos.UnionWith(m_firstpos);
+            var leftLastpos = m_lastpos;
+
+            token.Right.Accept(this);
+            lastpos.UnionWith(m_lastpos);
+            var rightFirstpos = m_firstpos;
+
+            if (Nullable(token.Left))
+                firtspos.UnionWith(rightFirstpos);
+
+            if (Nullable(token.Right))
+                lastpos.UnionWith(leftLastpos);
+
+            m_firstpos = firtspos;
+            m_lastpos = lastpos;
+
+            foreach (var i in leftLastpos)
+                Followpos(i).UnionWith(rightFirstpos);
+
+        }
+
+        public void Visit(EmptyToken<TTag> token) {
+            if (m_root == null)
+                m_root = token;
+        }
+
+        public void Visit(SymbolToken<TTag> token) {
+            if (m_root == null)
+                m_root = token;
+            m_idx++;
+            m_indexes[m_idx] = token.Value;
+            m_firstpos = new HashSet<int>(new[] { m_idx });
+            m_lastpos = new HashSet<int>(new[] { m_idx });
+        }
+
+        public void Visit(EndToken<TTag> token) {
+            if (m_root == null)
+                m_root = token;
+            m_idx++;
+            m_indexes[m_idx] = DFAConst.UNCLASSIFIED_INPUT;
+            m_firstpos = new HashSet<int>(new[] { m_idx });
+            m_lastpos = new HashSet<int>(new[] { m_idx });
+            Followpos(m_idx);
+            m_ends.Add(m_idx, token.Tag);
+        }
+
+        public void BuildDFA(ITaggedDFABuilder<TTag> dfa) {
+            Safe.ArgumentNotNull(dfa,"dfa");
+
+            var states = new MapAlphabet<HashSet<int>>(
+                             false,
+                             new CustomEqualityComparer<HashSet<int>>(
+                                 (x, y) => x.SetEquals(y),
+                                 x => x.Sum(n => n.GetHashCode())
+                             ));
+
+            var initialState = states.DefineSymbol(m_firstpos);
+            dfa.SetInitialState(initialState);
+
+            var tags = GetStateTags(m_firstpos);
+            if (tags != null && tags.Length > 0)
+                dfa.MarkFinalState(initialState, tags);
+
+            var inputMax = m_indexes.Values.Max();
+            var queue = new Queue<HashSet<int>>();
+
+            queue.Enqueue(m_firstpos);
+
+            while (queue.Count > 0) {
+                var state = queue.Dequeue();
+                var s1 = states.Translate(state);
+                Debug.Assert(s1 != DFAConst.UNCLASSIFIED_INPUT);
+
+                for (int a = 0; a <= inputMax; a++) {
+                    var next = new HashSet<int>();
+                    foreach (var p in state) {
+                        if (m_indexes[p] == a) {
+                            next.UnionWith(Followpos(p));
+                        }
+                    }
+                    if (next.Count > 0) {
+                        int s2 = states.Translate(next);
+                        if (s2 == DFAConst.UNCLASSIFIED_INPUT) {
+                            s2 = states.DefineSymbol(next);
+
+                            tags = GetStateTags(next);
+                            if (tags != null && tags.Length > 0) {
+                                dfa.MarkFinalState(s2);
+                                dfa.SetStateTag(s2, tags);
+                            }
+                            
+                            queue.Enqueue(next);
+                        }
+                        dfa.Add(new AutomatonTransition(s1, s2, a));
+                    }
+                }
+            }
+        }
+
+        TTag[] GetStateTags(IEnumerable<int> state) {
+            Debug.Assert(state != null);
+            return state.Where(m_ends.ContainsKey).Select(pos => m_ends[pos]).ToArray();
+        }
+
+    }
+}
--- a/Implab/Automaton/Scanner.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Automaton/Scanner.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -3,6 +3,7 @@
 using System.Collections.Generic;
 using System.IO;
 using Implab.Components;
+using Implab.Automaton.RegularExpressions;
 
 namespace Implab.Automaton {
     /// <summary>
@@ -14,19 +15,23 @@
     /// конца токена и допустимости текущего символа.
     /// </remarks>
     public abstract class Scanner<TTag> : Disposable {
-        struct ScannerConfig {
-            public DFAStateDescriptior<TTag>[] states;
-            public int[] alphabetMap;
-            public int initialState;
+        protected struct ScannerConfig {
+            public readonly DFAStateDescriptor<TTag>[] states;
+            public readonly int[] alphabet;
+            public readonly int initialState;
+
+            public ScannerConfig(DFAStateDescriptor<TTag>[] states, int[] alphabet, int initialState) {
+                this.initialState = initialState;
+                this.alphabet = alphabet;
+                this.states = states;
+            }
         }
 
         Stack<ScannerConfig> m_defs = new Stack<ScannerConfig>();
 
-        DFAStateDescriptior<TTag>[] m_states;
-        int[] m_alphabetMap;
-        int m_initialState;
+        ScannerConfig m_config;
 
-        protected DFAStateDescriptior<TTag> m_currentState;
+        protected DFAStateDescriptor<TTag> m_currentState;
         int m_previewCode;
 
         protected int m_tokenLen;
@@ -41,15 +46,11 @@
         int m_chunkSize = 1024; // 1k
         int m_limit = 10 * 1024 * 1024; // 10Mb
 
-        protected Scanner(DFAStateDescriptior<TTag>[] states, int[] alphabet, int initialState) {
-            Safe.ArgumentNotEmpty(states, "states");
-            Safe.ArgumentNotNull(alphabet, "alphabet");
+        protected Scanner(ScannerConfig config) {
+            Safe.ArgumentNotEmpty(config.states, "config.states");
+            Safe.ArgumentNotNull(config.alphabet, "config.alphabet");
 
-            m_states = states;
-            m_alphabetMap = alphabet;
-            m_initialState = initialState;
-
-            Feed(new char[0]);
+            m_config = config;
         }
 
         /// <summary>
@@ -110,7 +111,7 @@
         /// </summary>
         protected TTag[] TokenTags {
             get {
-                return m_currentState.tag;
+                return m_currentState.tags;
             }
         }
 
@@ -133,7 +134,7 @@
             if (m_pointer >= m_bufferSize)
                 return false;
 
-            m_currentState = m_states[m_initialState];
+            m_currentState = m_config.states[m_config.initialState];
             m_tokenLen = 0;
             m_tokenOffset = m_pointer;
             int nextState;
@@ -151,7 +152,7 @@
                         )
                     );
                 }
-                m_currentState = m_states[nextState];
+                m_currentState = m_config.states[nextState];
                 m_tokenLen++;
 
             } while (Shift());
@@ -172,7 +173,7 @@
                     return false;
             }
 
-            m_previewCode = m_alphabetMap[m_buffer[m_pointer]];
+            m_previewCode = m_config.alphabet[m_buffer[m_pointer]];
 
             return true;
         }
@@ -217,23 +218,14 @@
         /// Преключает внутренний ДКА на указанный, позволяет реализовать подобие захватывающей
         /// группировки.
         /// </summary>
-        /// <param name="states">Таблица состояний нового ДКА</param>
-        /// <param name="alphabet">Таблица входных символов для нового ДКА</param>
-        /// <param name = "initialState"></param>
-        protected void Switch(DFAStateDescriptior<TTag>[] states, int[] alphabet, int initialState) {
-            Safe.ArgumentNotNull(states, "dfa");
+        /// <param name = "config"></param>
+        protected void Switch(ScannerConfig config) {
+            Safe.ArgumentNotNull(config.states, "config.states");
 
-            m_defs.Push(new ScannerConfig {
-                states = m_states,
-                alphabetMap = m_alphabetMap,
-                initialState = m_initialState
-            });
+            m_defs.Push(m_config);
+            m_config = config;
 
-            m_states = states;
-            m_alphabetMap = alphabet;
-            m_initialState = initialState;
-
-            m_previewCode = m_alphabetMap[m_buffer[m_pointer]];
+            m_previewCode = m_config.alphabet[m_buffer[m_pointer]];
         }
 
         /// <summary>
@@ -242,11 +234,9 @@
         protected void Restore() {
             if (m_defs.Count == 0)
                 throw new InvalidOperationException();
-            var prev = m_defs.Pop();
-            m_states = prev.states;
-            m_alphabetMap = prev.alphabetMap;
-            m_initialState = prev.initialState;
-            m_previewCode = m_alphabetMap[m_buffer[m_pointer]];
+            m_config = m_defs.Pop();
+
+            m_previewCode = m_config.alphabet[m_buffer[m_pointer]];
         }
 
         protected override void Dispose(bool disposing) {
--- a/Implab/Formats/ByteAlphabet.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Formats/ByteAlphabet.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -13,6 +13,10 @@
             return (int)symbol;
         }
 
+        public override byte GetSymbolByIndex(int index) {
+            return (byte)index;
+        }
+
         public IEnumerable<byte> InputSymbols {
             get {
                 return Enumerable.Range(byte.MinValue, byte.MaxValue).Cast<byte>();
--- a/Implab/Formats/CharAlphabet.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Formats/CharAlphabet.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -13,6 +13,10 @@
             return symbol;
         }
 
+        public override char GetSymbolByIndex(int index) {
+            return (char)index;
+        }
+
         public override IEnumerable<char> InputSymbols {
             get { return Enumerable.Range(char.MinValue, char.MaxValue).Cast<char>(); }
         }
--- a/Implab/Formats/JSON/JSONGrammar.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Formats/JSON/JSONGrammar.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -1,6 +1,7 @@
 using System.Linq;
 using Implab.Automaton.RegularExpressions;
 using System;
+using Implab.Automaton;
 
 namespace Implab.Formats.JSON {
     class JSONGrammar : Grammar<char,JSONGrammar.TokenType> {
@@ -35,8 +36,8 @@
             get { return _instance.Value; }
         }
 
-        readonly RegularCharDFADefinition<TokenType> m_jsonDFA;
-        readonly RegularCharDFADefinition<TokenType> m_stringDFA;
+        readonly RegularDFA<char, TokenType> m_jsonDFA;
+        readonly RegularDFA<char, TokenType> m_stringDFA;
 
         public JSONGrammar() {
             DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));
@@ -87,21 +88,17 @@
                 .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));
                     
 
-            m_jsonDFA = new RegularCharDFADefinition<TokenType>(new CharAlphabet()); 
-            BuildDFA(jsonExpression, m_jsonDFA, m_jsonDFA.InputAlphabet);
-
-
-            m_stringDFA = new RegularCharDFADefinition<TokenType>(new CharAlphabet());
-            BuildDFA(jsonStringExpression, m_jsonDFA, m_jsonDFA.InputAlphabet);
+            m_jsonDFA = BuildDFA(jsonExpression);
+            m_stringDFA = BuildDFA(jsonStringExpression);
         }
 
-        public RegularCharDFADefinition<TokenType> JsonDFA {
+        public RegularDFA<char, TokenType> JsonDFA {
             get {
                 return m_jsonDFA;
             }
         }
 
-        public RegularDFADefinition<char,TokenType> JsonStringDFA {
+        public RegularDFA<char,TokenType> JsonStringDFA {
             get {
                 return m_stringDFA;
             }
@@ -110,6 +107,10 @@
         Token<TokenType> SymbolRangeToken(char start, char stop) {
             return SymbolToken(Enumerable.Range(start,stop - start).Cast<char>());
         }
+
+        protected override IAlphabetBuilder<char> CreateAlphabet() {
+            return new CharAlphabet();
+        }
                 
     }
 }
--- a/Implab/Formats/JSON/JSONParser.cs	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Formats/JSON/JSONParser.cs	Mon Mar 14 01:19:38 2016 +0300
@@ -86,9 +86,9 @@
             return Token<object>.New(input.Select(t => _alphabet.Translate(t)).ToArray());
         }
 
-        static RegularDFADefinition<JsonTokenType,object> CreateDFA(Token<object> expr) {
-            var builder = new RegularDFABuilder<object>();
-            var dfa = new RegularDFADefinition<JsonTokenType,object>(_alphabet);
+        static RegularDFA<JsonTokenType,object> CreateDFA(Token<object> expr) {
+            var builder = new RegularExpressionVisitor<object>();
+            var dfa = new RegularDFA<JsonTokenType,object>(_alphabet);
 
             expr.Accept(builder);
 
--- a/Implab/Formats/RegularCharDFADefinition.cs	Thu Mar 10 01:19:33 2016 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,17 +0,0 @@
-using System;
-using Implab.Automaton.RegularExpressions;
-
-namespace Implab.Formats {
-    public class RegularCharDFADefinition<TTag> : RegularDFADefinition<char,TTag>  {
-        readonly CharAlphabet m_alphabet;
-
-        public RegularCharDFADefinition(CharAlphabet alphabet) : base(alphabet) {
-            m_alphabet = alphabet;
-        }
-
-        public new CharAlphabet InputAlphabet {
-            get { return m_alphabet; }
-        }
-    }
-}
-
--- a/Implab/Implab.csproj	Thu Mar 10 01:19:33 2016 +0300
+++ b/Implab/Implab.csproj	Mon Mar 14 01:19:38 2016 +0300
@@ -170,7 +170,6 @@
     <Compile Include="Automaton\RegularExpressions\Token.cs" />
     <Compile Include="Automaton\RegularExpressions\IVisitor.cs" />
     <Compile Include="Automaton\AutomatonTransition.cs" />
-    <Compile Include="Automaton\RegularExpressions\RegularDFABuilder.cs" />
     <Compile Include="Formats\JSON\JSONElementContext.cs" />
     <Compile Include="Formats\JSON\JSONElementType.cs" />
     <Compile Include="Formats\JSON\JSONGrammar.cs" />
@@ -183,13 +182,14 @@
     <Compile Include="Formats\JSON\StringTranslator.cs" />
     <Compile Include="Automaton\MapAlphabet.cs" />
     <Compile Include="Automaton\DummyAlphabet.cs" />
-    <Compile Include="Automaton\RegularExpressions\RegularDFADefinition.cs" />
     <Compile Include="Formats\CharAlphabet.cs" />
     <Compile Include="Formats\ByteAlphabet.cs" />
-    <Compile Include="Formats\RegularCharDFADefinition.cs" />
     <Compile Include="Automaton\IDFATable.cs" />
     <Compile Include="Automaton\IDFATableBuilder.cs" />
     <Compile Include="Automaton\DFATable.cs" />
+    <Compile Include="Automaton\RegularExpressions\RegularDFA.cs" />
+    <Compile Include="Automaton\RegularExpressions\RegularExpressionVisitor.cs" />
+    <Compile Include="Automaton\RegularExpressions\ITaggedDFABuilder.cs" />
     <Compile Include="Automaton\RegularExpressions\DFAStateDescriptorT.cs" />
   </ItemGroup>
   <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />