Mercurial > pub > ImplabNet
changeset 162:0526412bbb26 ref20160224
DFA refactoring
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/AutomatonTransition.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,33 @@ +using System; + +namespace Implab.Automaton { + struct AutomatonTransition : IEquatable<AutomatonTransition> { + public readonly int s1; + public readonly int s2; + public readonly int edge; + + public AutomatonTransition(int s1, int s2, int edge) { + this.s1 = s1; + this.s2 = s2; + this.edge = edge; + } + + + #region IEquatable implementation + public bool Equals(AutomatonTransition other) { + return other.s1 == s1 && other.s2 == s2 && other.edge == edge ; + } + #endregion + + public override bool Equals(object obj) { + if (obj is AutomatonTransition) + return Equals((AutomatonTransition)obj); + return base.Equals(obj); + } + + public override int GetHashCode() { + return s1 + s2 + edge; + } + } +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/ByteAlphabet.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,23 @@ +using System; + +namespace Implab.Automaton { + public class ByteAlphabet : IndexedAlphabetBase<byte> { + public ByteAlphabet() : base(byte.MaxValue + 1){ + } + + #region implemented abstract members of IndexedAlphabetBase + + public override int GetSymbolIndex(byte symbol) { + return (int)symbol; + } + + public override System.Collections.Generic.IEnumerable<byte> InputSymbols { + get { + throw new NotImplementedException(); + } + } + + #endregion + } +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/CDFADefinition.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,22 @@ +namespace Implab.Automaton { + public class CDFADefinition : DFADefinition { + readonly CharAlphabet m_alphabet; + + public CharAlphabet Alphabet { + get { return m_alphabet; } + } + + public CDFADefinition(CharAlphabet alphabet): base(alphabet.Count) { + m_alphabet = alphabet; + } + + public CDFADefinition Optimize() { + + return (CDFADefinition)Optimize(alphabet => new CDFADefinition((CharAlphabet)alphabet), m_alphabet, new CharAlphabet()); + } + + public void PrintDFA() { + PrintDFA(m_alphabet); + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/CharAlphabet.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,23 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton { + public class CharAlphabet: IndexedAlphabetBase<char> { + + public CharAlphabet() + : base(char.MaxValue + 1) { + } + + public override int GetSymbolIndex(char symbol) { + return symbol; + } + + public override IEnumerable<char> InputSymbols { + get { return Enumerable.Range(char.MinValue, char.MaxValue).Select(x => (char)x); } + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/DFAConst.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,10 @@ +using System; + +namespace Implab.Automaton { + public static class DFAConst { + public const int UNREACHABLE_STATE = -1; + + public const int UNCLASSIFIED_INPUT = 0; + } +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/DFADefinition.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,213 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Implab.Automaton { + public class DFADefinition<TInput, TState, TTag> : IDFADefinitionBuilder<TTag>, IDFADefinition<TInput, TState, TTag> { + DFAStateDescriptior<TTag>[] m_dfaTable; + readonly IAlphabet<TInput> m_inputAlphabet; + readonly IAlphabet<TState> m_stateAlphabet; + + readonly Dictionary<int, TTag[]> m_finalStates = new Dictionary<int, TTag[]>(); + readonly HashSet<AutomatonTransition> m_transitions = new HashSet<AutomatonTransition>(); + + public DFADefinition(IAlphabet<TInput> inputAlphabet, IAlphabet<TState> stateAlphabet) { + Safe.ArgumentNotNull(inputAlphabet, "inputAlphabet"); + Safe.ArgumentNotNull(stateAlphabet, "stateAlphabet"); + + m_inputAlphabet = inputAlphabet; + m_stateAlphabet = stateAlphabet; + } + + public void DefineTransition(int s1, int s2, int symbol) { + Safe.ArgumentInRange(s1, 0, m_stateAlphabet.Count-1, "s1"); + Safe.ArgumentInRange(s2, 0, m_stateAlphabet.Count-1, "s2"); + Safe.ArgumentInRange(symbol, 0, m_inputAlphabet.Count-1, "symbol"); + + m_transitions.Add(new AutomatonTransition(s1, s2, symbol)); + } + + + #region IDFADefinition implementation + + public DFAStateDescriptior<TTag>[] GetTransitionTable() { + if (m_dfaTable == null) { + m_dfaTable = new DFAStateDescriptior<TTag>[m_stateAlphabet.Count]; + + foreach (var pair in m_finalStates) { + var idx = pair.Key; + + m_dfaTable[idx].final = true; + m_dfaTable[idx].tag = m_dfaTable[idx].tag != null ? + m_dfaTable[idx].tag.Concat(pair.Value).Distinct().ToArray() : + pair.Value; + } + + foreach (var t in m_transitions) { + if (m_dfaTable[t.s1].transitions == null) { + m_dfaTable[t.s1].transitions = new int[m_inputAlphabet.Count]; + for (int i = 0; i < m_dfaTable[t.s1].transitions.Length; i++) + m_dfaTable[t.s1].transitions[i] = DFAConst.UNREACHABLE_STATE; + } + + m_dfaTable[t.s1].transitions[t.edge] = t.s2; + } + } + return m_dfaTable; + } + + public IAlphabet<TInput> InputAlphabet { + get { + return m_inputAlphabet; + } + } + + public IAlphabet<TState> StateAlphabet { + get { + return m_stateAlphabet; + } + } + + #endregion + + #region IDFADefinitionBuilder + + public void DefineTransition(int s1, int s2, int symbol) { + Safe.ArgumentInRange(s1, 0, m_stateAlphabet.Count - 1, "s1"); + Safe.ArgumentInRange(s2, 0, m_stateAlphabet.Count - 1, "s2"); + Safe.ArgumentInRange(symbol, 0, m_inputAlphabet.Count - 1, "symbol"); + + m_transitions.Add(new AutomatonTransition(s1, s2, symbol)); + } + + public void MarkFinalState(int state, params TTag[] tags) { + m_finalStates[state] = tags; + } + + + #endregion + + protected void Optimize(IDFADefinitionBuilder<TTag> optimalDFA,IAlphabetBuilder<TInput> optimalInputAlphabet, IAlphabetBuilder<TState> optimalStateAlphabet) { + Safe.ArgumentNotNull(optimalDFA, "dfa"); + Safe.ArgumentNotNull(optimalInputAlphabet, "optimalInputAlphabet"); + Safe.ArgumentNotNull(optimalStateAlphabet, "optimalStateAlphabet"); + + var setComparer = new CustomEqualityComparer<HashSet<int>>( + (x, y) => x.SetEquals(y), + s => s.Sum(x => x.GetHashCode()) + ); + + var arrayComparer = new CustomEqualityComparer<TTag[]>( + (x,y) => (new HashSet<int>(x)).SetEquals(new HashSet<int>(y)), + a => a.Sum(x => x.GetHashCode()) + ); + + var optimalStates = new HashSet<HashSet<int>>(setComparer); + var queue = new HashSet<HashSet<int>>(setComparer); + + // получаем конечные состояния, сгруппированные по маркерам + optimalStates.UnionWith( + m_finalStates + .GroupBy(pair => pair.Value, arrayComparer) + .Select( + g => new HashSet<int>( + g.Select( pair => pair.Key) + ) + ) + ); + + var state = new HashSet<int>( + Enumerable + .Range(0, m_stateAlphabet.Count - 1) + .Where(i => !m_finalStates.ContainsKey(i)) + ); + optimalStates.Add(state); + queue.Add(state); + + var rmap = m_transitions + .GroupBy(t => t.s2) + .ToLookup( + g => g.Key, // s2 + g => g.ToLookup(t => t.edge, t => t.s1) + ); + + while (queue.Count > 0) { + var stateA = queue.First(); + queue.Remove(stateA); + + for (int c = 0; c < m_inputAlphabet.Count; c++) { + var stateX = new HashSet<int>(); + foreach(var a in stateA) + stateX.UnionWith(rmap[a][c]); // all states from wich 'c' leads to 'a' + + foreach (var stateY in optimalStates.ToArray()) { + if (stateX.Overlaps(stateY) && !stateY.IsSubsetOf(stateX)) { + var stateR1 = new HashSet<int>(stateY); + var stateR2 = new HashSet<int>(stateY); + + stateR1.IntersectWith(stateX); + stateR2.ExceptWith(stateX); + + optimalStates.Remove(stateY); + optimalStates.Add(stateR1); + optimalStates.Add(stateR2); + + if (queue.Contains(stateY)) { + queue.Remove(stateY); + queue.Add(stateR1); + queue.Add(stateR2); + } else { + queue.Add(stateR1.Count <= stateR2.Count ? stateR1 : stateR2); + } + } + } + } + } + + // карта получения оптимального состояния по соотвествующему ему простому состоянию + var statesMap = m_stateAlphabet.Reclassify(optimalStateAlphabet, optimalStates); + + // получаем минимальный алфавит + // входные символы не различимы, если Move(s,a1) == Move(s,a2) + var optimalAlphabet = m_transitions + .GroupBy(t => Tuple.Create(statesMap[t.s1], statesMap[t.s2]), t => t.edge); + + var alphabetMap = m_inputAlphabet.Reclassify(optimalInputAlphabet, optimalAlphabet); + + var optimalTags = m_finalStates + .GroupBy(pair => statesMap[pair.Key]) + .ToDictionary( + g => g.Key, + g => g.SelectMany(pair => pair.Value).ToArray() + ); + + // построение автомата + foreach (var pair in optimalTags) + optimalDFA.MarkFinalState(pair.Key, pair.Value); + + foreach (var t in m_transitions.Select(t => new AutomatonTransition(statesMap[t.s1],statesMap[t.s2],alphabetMap[t.edge])).Distinct()) + optimalDFA.DefineTransition(t.s1, t.s2, t.edge); + } + + public void PrintDFA() { + + var inputMap = InputAlphabet.CreateReverseMap(); + var stateMap = StateAlphabet.CreateReverseMap(); + + for (int i = 0; i < inputMap.Length; i++) { + Console.WriteLine("C{0}: {1}", i, String.Join(",", inputMap[i])); + } + + foreach(var t in m_transitions) + Console.WriteLine( + "S{0} -{1}-> S{2}{3}", + stateMap[t.s1], + String.Join(",", inputMap[t.edge]), + stateMap[t.s2], + m_finalStates.ContainsKey(t.s2) ? "$" : "" + ); + + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/DFAStateDescriptor.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,13 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton { + public struct DFAStateDescriptior<TTag> { + public bool final; + public TTag[] tag; + public int[] transitions; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/DFAutomaton.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,61 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton { + public abstract class DFAutomaton<T> { + protected struct ContextFrame { + public DFAStateDescriptior[] states; + public int current; + public T info; + } + + public const int INITIAL_STATE = DFADefinition.INITIAL_STATE; + public const int UNREACHEBLE_STATE = DFADefinition.UNREACHEBLE_STATE; + + protected ContextFrame m_context; + Stack<ContextFrame> m_contextStack = new Stack<ContextFrame>(); + + protected int Level { + get { return m_contextStack.Count; } + } + + protected DFAutomaton(DFAStateDescriptior[] states, int startState, T info) { + Safe.ArgumentNotNull(states, "states"); + Safe.ArgumentInRange(startState, 0, states.Length - 1, "startState"); + + m_context.states = states; + m_context.current = startState; + m_context.info = info; + } + + protected void Switch(DFAStateDescriptior[] states, int current, T info) { + Debug.Assert(states != null); + Debug.Assert(current >= 0 && current < states.Length); + m_contextStack.Push(m_context); + m_context.states = states; + m_context.current = current; + m_context.info = info; + } + + protected void Restore() { + Debug.Assert(m_contextStack.Count > 0); + + m_context = m_contextStack.Pop(); + } + + protected void Move(int input) { + Debug.Assert(input > 0 && input < m_context.states[m_context.current].transitions.Length); + m_context.current = m_context.states[m_context.current].transitions[input]; + } + + protected bool CanMove(int input) { + Debug.Assert(input > 0 && input < m_context.states[m_context.current].transitions.Length); + return m_context.states[m_context.current].transitions[input] != UNREACHEBLE_STATE; + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/EDFADefinition.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,29 @@ +using Implab; +using System; + +namespace Implab.Parsing { + public class EDFADefinition<T> : DFADefinition where T : struct, IConvertible { + readonly EnumAlphabet<T> m_alphabet; + + public EnumAlphabet<T> Alphabet { + get { return m_alphabet; } + } + + public EDFADefinition(EnumAlphabet<T> alphabet) : base(alphabet.Count) { + m_alphabet = alphabet; + } + + public void DefineTransition(int s1, int s2, T input) { + DefineTransition(s1, s2, m_alphabet.Translate(input)); + } + + public EDFADefinition<T> Optimize() { + + return (EDFADefinition<T>)Optimize(alphabet => new EDFADefinition<T>((EnumAlphabet<T>)alphabet), m_alphabet, new EnumAlphabet<T>()); + } + + public void PrintDFA() { + PrintDFA(m_alphabet); + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/EnumAlphabet.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,67 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.Linq; +using System.Diagnostics.CodeAnalysis; + +namespace Implab.Automaton { + /// <summary> + /// Алфавит символами которого являются элементы перечислений. + /// </summary> + /// <typeparam name="T">Тип перечислений</typeparam> + public class EnumAlphabet<T> : IndexedAlphabetBase<T> where T : struct, IConvertible { + [SuppressMessage("Microsoft.Design", "CA1000:DoNotDeclareStaticMembersOnGenericTypes")] + static readonly T[] _symbols; + static readonly EnumAlphabet<T> _fullAlphabet; + + [SuppressMessage("Microsoft.Design", "CA1065:DoNotRaiseExceptionsInUnexpectedLocations")] + static EnumAlphabet() { + if (!typeof(T).IsEnum) + throw new InvalidOperationException("Invalid generic parameter, enumeration is required"); + + if (Enum.GetUnderlyingType(typeof(T)) != typeof(Int32)) + throw new InvalidOperationException("Only enums based on Int32 are supported"); + + _symbols = ((T[])Enum.GetValues(typeof(T))) + .OrderBy(x => x.ToInt32(CultureInfo.InvariantCulture)) + .ToArray(); + + if ( + _symbols[_symbols.Length - 1].ToInt32(CultureInfo.InvariantCulture) >= _symbols.Length + || _symbols[0].ToInt32(CultureInfo.InvariantCulture) != 0 + ) + throw new InvalidOperationException("The specified enumeration must be zero-based and continuously numbered"); + + _fullAlphabet = new EnumAlphabet<T>(_symbols.Select(x => x.ToInt32(CultureInfo.InvariantCulture)).ToArray()); + } + + + + public static EnumAlphabet<T> FullAlphabet { + get { + return _fullAlphabet; + } + } + + + public EnumAlphabet() + : base(_symbols.Length) { + } + + public EnumAlphabet(int[] map) + : base(map) { + Debug.Assert(map.Length == _symbols.Length); + } + + + public override int GetSymbolIndex(T symbol) { + return symbol.ToInt32(CultureInfo.InvariantCulture); + } + + public override IEnumerable<T> InputSymbols { + get { return _symbols; } + } + + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/IAlphabet.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,48 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton { + /// <summary> + /// Алфавит. Множество символов, которые разбиты на классы, при этом классы имеют непрерывную нумерацию, + /// что позволяет использовать их в качестве индексов массивов. + /// </summary> + /// <remarks> + /// <para>Алфавит является сюрьективным отображением множества символов в множество индексов, это позволяет сократить размер таблицы переходов автомата + /// для входных символов, которые для него не различимы.</para> + /// </remarks> + /// <typeparam name="TSymbol">Тип символов.</typeparam> + public interface IAlphabet<TSymbol> { + /// <summary> + /// Количество классов символов в алфавите. + /// </summary> + int Count { get; } + + /// <summary> + /// Создает карту обратного сопоставления класса символов алфавита и сопоставленным + /// ему исходным символам. + /// </summary> + /// <returns></returns> + List<TSymbol>[] CreateReverseMap(); + + /// <summary> + /// Создает новый алфавит на основе текущего, горппируя его сиволы в более + /// крупные непересекающиеся классы символов. + /// </summary> + /// <param name="newAlphabet">Новый, пустой алфавит, в котором быдут определены классы.</param> + /// <param name="classes">Множество классов символов текущего алфавита.</param> + /// <returns>Карта для перехода классов текущего + /// алфавита к классам нового.</returns> + /// <remarks>Ползволяет укрупнить алфавит, объединив классы в текущем алфавите. Используется при оптимизации автомата.</remarks> + int[] Reclassify(IAlphabetBuilder<TSymbol> newAlphabet, IEnumerable<IEnumerable<int>> classes); + + /// <summary> + /// Преобразует входной символ в индекс символа из алфавита. + /// </summary> + /// <param name="symobl">Исходный символ</param> + /// <returns>Индекс в алфавите</returns> + int Translate(TSymbol symobl); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/IAlphabetBuilder.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,25 @@ + +using System.Collections.Generic; + +namespace Implab.Automaton { + public interface IAlphabetBuilder<TSymbol> : IAlphabet<TSymbol> { + /// <summary> + /// Добавляет новый символ в алфавит, если символ уже был добавлен, то + /// возвращается ранее сопоставленный с символом класс. + /// </summary> + /// <param name="symbol">Символ для добавления.</param> + /// <returns>Индекс класса, который попоставлен с символом.</returns> + int DefineSymbol(TSymbol symbol); + /// <summary> + /// Доабвляем класс символов. Множеству указанных исходных символов + /// будет сопоставлен символ в алфавите. + /// </summary> + /// <param name="symbols">Множестов исходных символов</param> + /// <returns>Идентификатор символа алфавита.</returns> + int DefineClass(IEnumerable<TSymbol> symbols); + + + + } +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/IDFADefinition.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,56 @@ + +namespace Implab.Automaton { + /// <summary> + /// Полностью описывает DFA автомат, его поведение, состояние и входные символы. + /// </summary> + /// <example> + /// class MyAutomaton { + /// int m_current; + /// readonly DFAStateDescriptor<string>[] m_automaton; + /// readonly IAlphabet<MyCommands> m_commands; + /// + /// public MyAutomaton(IDFADefinition<MyCommands,MyStates,string> definition) { + /// m_current = definition.StateAlphabet.Translate(MyStates.Initial); + /// m_automaton = definition.GetTransitionTable(); + /// m_commands = definition.InputAlphabet; + /// } + /// + /// // defined a method which will move the automaton to the next state + /// public void Move(MyCommands cmd) { + /// // use transition map to determine the next state + /// var next = m_automaton[m_current].transitions[m_commands.Translate(cmd)]; + /// + /// // validate that we aren't in the unreachable state + /// if (next == DFAConst.UNREACHABLE_STATE) + /// throw new InvalidOperationException("The specified command is invalid"); + /// + /// // if everything is ok + /// m_current = next; + /// } + /// } + /// </example> + public interface IDFADefinition<TInput, TState, TTag> { + /// <summary> + /// Алфавит входных символов + /// </summary> + /// <value>The input alphabet.</value> + IAlphabet<TInput> InputAlphabet { + get; + } + + /// <summary> + /// Алфавит состояний автомата + /// </summary> + /// <value>The state alphabet.</value> + IAlphabet<TState> StateAlphabet { + get; + } + + /// <summary> + /// Таблица переходов состояний автомата + /// </summary> + /// <returns>The transition table.</returns> + DFAStateDescriptior<TTag>[] GetTransitionTable(); + + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/IDFADefinitionBuilder.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,23 @@ +using System; + +namespace Implab.Automaton { + public interface IDFADefinitionBuilder<TTag> { + /// <summary> + /// Marks the state as final and assings tags. + /// </summary> + /// <param name="state">State.</param> + /// <param name="tags">Tags.</param> + void MarkFinalState(int state, params TTag[] tags); + + /// <summary> + /// Defines the transition from <paramref name="s1"/> to + /// <paramref name="s2"/> with input <paramref name="symbol"/>. + /// </summary> + /// <param name="s1">S1.</param> + /// <param name="s2">S2.</param> + /// <param name="symbol">Symbol.</param> + void DefineTransition(int s1, int s2, int symbol); + + } +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/IndexedAlphabetBase.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,105 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; + +namespace Implab.Automaton { + /// <summary> + /// Indexed alphabet is the finite set of symbols where each symbol has a zero-based unique index. + /// </summary> + public abstract class IndexedAlphabetBase<T> : IAlphabetBuilder<T> { + public const int UNCLASSIFIED = 0; + + int m_nextId = 1; + readonly int[] m_map; + + public int Count { + get { return m_nextId; } + } + + protected IndexedAlphabetBase(int mapSize) { + m_map = new int[mapSize]; + } + + protected IndexedAlphabetBase(int[] map) { + Debug.Assert(map != null); + + m_map = map; + m_nextId = map.Max() + 1; + } + + public int DefineSymbol(T symbol) { + var index = GetSymbolIndex(symbol); + if (m_map[index] == UNCLASSIFIED) + m_map[index] = m_nextId++; + return m_map[index]; + } + + public int DefineClass(IEnumerable<T> symbols) { + Safe.ArgumentNotNull(symbols, "symbols"); + symbols = symbols.Distinct(); + + foreach (var symbol in symbols) { + var index = GetSymbolIndex(symbol); + if (m_map[index] == UNCLASSIFIED) + m_map[GetSymbolIndex(symbol)] = m_nextId; + else + throw new InvalidOperationException(String.Format("Symbol '{0}' already in use", symbol)); + } + return m_nextId++; + } + + public List<T>[] CreateReverseMap() { + return + Enumerable.Range(UNCLASSIFIED, Count) + .Select( + i => InputSymbols + .Where(x => i != UNCLASSIFIED && m_map[GetSymbolIndex(x)] == i) + .ToList() + ) + .ToArray(); + } + + public int[] Reclassify(IAlphabetBuilder<T> newAlphabet, IEnumerable<IEnumerable<int>> classes) { + Safe.ArgumentNotNull(newAlphabet, "newAlphabet"); + Safe.ArgumentNotNull(classes, "classes"); + var reverseMap = CreateReverseMap(); + + var translationMap = new int[Count]; + + foreach (var scl in classes) { + // skip if the supper class contains the unclassified element + if (scl.Contains(UNCLASSIFIED)) + continue; + var range = new List<T>(); + foreach (var cl in scl) { + if (cl < 0 || cl >= reverseMap.Length) + throw new ArgumentOutOfRangeException(String.Format("Class {0} is not valid for the current alphabet", cl)); + range.AddRange(reverseMap[cl]); + } + var newClass = newAlphabet.DefineClass(range); + foreach (var cl in scl) + translationMap[cl] = newClass; + } + + return translationMap; + } + + public virtual int Translate(T symbol) { + return m_map[GetSymbolIndex(symbol)]; + } + + public abstract int GetSymbolIndex(T symbol); + + public abstract IEnumerable<T> InputSymbols { get; } + + /// <summary> + /// Gets the translation map from the index of the symbol to it's class this is usefull for the optimized input symbols transtaion. + /// </summary> + /// <returns>The translation map.</returns> + public int[] GetTranslationMap() { + return m_map; + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/ParserException.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,17 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Implab.Automaton { + [Serializable] + public class ParserException : Exception { + public ParserException() { } + public ParserException(string message) : base(message) { } + public ParserException(string message, Exception inner) : base(message, inner) { } + protected ParserException( + System.Runtime.Serialization.SerializationInfo info, + System.Runtime.Serialization.StreamingContext context) + : base(info, context) { } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/AltToken.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,17 @@ +using System; + +namespace Implab.Automaton.RegularExpressions { + public class AltToken<TTag>: BinaryToken<TTag> { + public AltToken(Token<TTag> left, Token<TTag> right) + : base(left, right) { + } + + public override void Accept(IVisitor<TTag> visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + public override string ToString() { + return String.Format(Right is BinaryToken<TTag> ? "{0}|({1})" : "{0}|{1}", Left, Right); + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/BinaryToken.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,21 @@ +using Implab; + +namespace Implab.Automaton.RegularExpressions { + public abstract class BinaryToken<TTag> : Token<TTag> { + readonly Token<TTag> m_left; + readonly Token<TTag> m_right; + + public Token<TTag> Left { + get { return m_left; } + } + + public Token<TTag> Right { + get { return m_right; } + } + + protected BinaryToken(Token<TTag> left, Token<TTag> right) { + Safe.ArgumentNotNull(m_left = left, "left"); + Safe.ArgumentNotNull(m_right = right, "right"); + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/CatToken.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,22 @@ +using System; + +namespace Implab.Automaton.RegularExpressions { + public class CatToken<TTag> : BinaryToken<TTag> { + public CatToken(Token<TTag> left, Token<TTag> right) + : base(left, right) { + } + + public override void Accept(IVisitor<TTag> visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + + public override string ToString() { + return String.Format("{0}{1}", FormatToken(Left), FormatToken(Right)); + } + + static string FormatToken(Token<TTag> token) { + return String.Format(token is AltToken<TTag> ? "({0})" : "{0}", token); + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/DFABuilder.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,181 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; + +namespace Implab.Automaton.RegularExpressions { + /// <summary> + /// Используется для построения ДКА по регулярному выражению, сначала обходит + /// регулярное выражение и вычисляет followpos, затем используется метод + /// <see cref="BuildDFA(IDFADefinition)"/> для построения автомата. + /// </summary> + public class DFABuilder<TTag> : IVisitor<TTag> { + int m_idx = 0; + Token<TTag> m_root; + HashSet<int> m_firstpos; + HashSet<int> m_lastpos; + + readonly Dictionary<int, HashSet<int>> m_followpos = new Dictionary<int, HashSet<int>>(); + readonly Dictionary<int, int> m_indexes = new Dictionary<int, int>(); + readonly Dictionary<int, TTag> m_ends = new Dictionary<int, TTag>(); + + public Dictionary<int, HashSet<int>> FollowposMap { + get { return m_followpos; } + } + + public HashSet<int> Followpos(int pos) { + HashSet<int> set; + if (m_followpos.TryGetValue(pos, out set)) + return set; + return m_followpos[pos] = new HashSet<int>(); + } + + bool Nullable(object n) { + if (n is EmptyToken<TTag> || n is StarToken<TTag>) + return true; + if (n is AltToken<TTag>) + return Nullable(((AltToken<TTag>)n).Left) || Nullable(((AltToken<TTag>)n).Right); + if (n is CatToken<TTag>) + return Nullable(((CatToken<TTag>)n).Left) && Nullable(((CatToken<TTag>)n).Right); + return false; + } + + + public void Visit(AltToken<TTag> token) { + if (m_root == null) + m_root = token; + var firtspos = new HashSet<int>(); + var lastpos = new HashSet<int>(); + + token.Left.Accept(this); + firtspos.UnionWith(m_firstpos); + lastpos.UnionWith(m_lastpos); + + token.Right.Accept(this); + firtspos.UnionWith(m_firstpos); + lastpos.UnionWith(m_lastpos); + + m_firstpos = firtspos; + m_lastpos = lastpos; + } + + public void Visit(StarToken<TTag> token) { + if (m_root == null) + m_root = token; + token.Token.Accept(this); + + foreach (var i in m_lastpos) + Followpos(i).UnionWith(m_firstpos); + } + + public void Visit(CatToken<TTag> token) { + if (m_root == null) + m_root = token; + + var firtspos = new HashSet<int>(); + var lastpos = new HashSet<int>(); + token.Left.Accept(this); + firtspos.UnionWith(m_firstpos); + var leftLastpos = m_lastpos; + + token.Right.Accept(this); + lastpos.UnionWith(m_lastpos); + var rightFirstpos = m_firstpos; + + if (Nullable(token.Left)) + firtspos.UnionWith(rightFirstpos); + + if (Nullable(token.Right)) + lastpos.UnionWith(leftLastpos); + + m_firstpos = firtspos; + m_lastpos = lastpos; + + foreach (var i in leftLastpos) + Followpos(i).UnionWith(rightFirstpos); + + } + + public void Visit(EmptyToken<TTag> token) { + if (m_root == null) + m_root = token; + } + + public void Visit(SymbolToken<TTag> token) { + if (m_root == null) + m_root = token; + m_idx++; + m_indexes[m_idx] = token.Value; + m_firstpos = new HashSet<int>(new[] { m_idx }); + m_lastpos = new HashSet<int>(new[] { m_idx }); + } + + public void Visit(EndToken<TTag> token) { + if (m_root == null) + m_root = token; + m_idx++; + m_indexes[m_idx] = DFAConst.UNCLASSIFIED_INPUT; + m_firstpos = new HashSet<int>(new[] { m_idx }); + m_lastpos = new HashSet<int>(new[] { m_idx }); + Followpos(m_idx); + m_ends.Add(m_idx, token.Tag); + } + + public void BuildDFA(IDFADefinitionBuilder<TTag> dfa, IAlphabetBuilder<int> states) { + Safe.ArgumentNotNull(dfa,"dfa"); + + var stateMap = new Dictionary<HashSet<int>, int>(new CustomEqualityComparer<HashSet<int>>( + (x, y) => x.SetEquals(y), + x => x.Sum(n => n.GetHashCode()) + )); + + int nextState = 0; + + int initialState = states.DefineSymbol(nextState++); + stateMap[m_firstpos] = initialState; + + var tags = GetStateTags(m_firstpos); + if (tags != null && tags.Length > 0) + dfa.MarkFinalState(initialState, tags); + + var inputMax = m_indexes.Values.Max(); + var queue = new Queue<HashSet<int>>(); + + queue.Enqueue(m_firstpos); + + while (queue.Count > 0) { + var state = queue.Dequeue(); + var s1 = stateMap[state]; + + for (int a = 0; a <= inputMax; a++) { + var next = new HashSet<int>(); + foreach (var p in state) { + if (m_indexes[p] == a) { + next.UnionWith(Followpos(p)); + } + } + if (next.Count > 0) { + int s2; + if (!stateMap.TryGetValue(next, out s2)) { + s2 = states.DefineSymbol(nextState++); + stateMap[next] = s2; + tags = GetStateTags(next); + if (tags != null && tags.Length > 0) + dfa.MarkFinalState(s2, tags); + + queue.Enqueue(next); + } + dfa.DefineTransition(s1, s2, a); + } + } + } + } + + TTag[] GetStateTags(IEnumerable<int> state) { + Debug.Assert(state != null); + return state.Where(m_ends.ContainsKey).Select(pos => m_ends[pos]).ToArray(); + } + + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/EmptyToken.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,13 @@ +using Implab; + +namespace Implab.Automaton.RegularExpressions { + public class EmptyToken<TTag> : Token<TTag> { + public override void Accept(IVisitor<TTag> visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + public override string ToString() { + return "$"; + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/EndToken.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,32 @@ +using Implab; + +namespace Implab.Automaton.RegularExpressions { + /// <summary> + /// Конечный символ расширенного регулярного выражения, при построении ДКА + /// используется для определения конечных состояний. + /// </summary> + public class EndToken<TTag>: Token<TTag> { + + TTag m_tag; + + public EndToken(TTag tag) { + m_tag = tag; + } + + public EndToken() + : this(0) { + } + + public TTag Tag { + get { return m_tag; } + } + + public override void Accept(IVisitor<TTag> visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + public override string ToString() { + return "#"; + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/Grammar.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,108 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton.RegularExpressions { + /// <summary> + /// Базовый абстрактный класс. Грамматика, позволяет формулировать выражения над алфавитом типа <c>char</c>. + /// </summary> + /// <typeparam name="TGrammar"></typeparam> + public abstract class Grammar<TGrammar> where TGrammar: Grammar<TGrammar>, new() { + static TGrammar _instance; + + public static TGrammar Instance{ + get { + if (_instance == null) + _instance = new TGrammar(); + return _instance; + } + } + + readonly CharAlphabet m_alphabet = new CharAlphabet(); + + public CharAlphabet Alphabet { + get { return m_alphabet; } + } + + public SymbolToken UnclassifiedToken() { + return new SymbolToken(CharAlphabet.UNCLASSIFIED); + } + + public void DefineAlphabet(IEnumerable<char> alphabet) { + Safe.ArgumentNotNull(alphabet, "alphabet"); + + foreach (var ch in alphabet) + m_alphabet.DefineSymbol(ch); + } + public Token SymbolRangeToken(char start, char end) { + return SymbolToken(Enumerable.Range(start, end - start + 1).Select(x => (char)x)); + } + + public Token SymbolToken(char symbol) { + return Token.New(TranslateOrAdd(symbol)); + } + + public Token SymbolToken(IEnumerable<char> symbols) { + Safe.ArgumentNotNull(symbols, "symbols"); + + return Token.New(TranslateOrAdd(symbols).ToArray()); + } + + public Token SymbolSetToken(params char[] set) { + return SymbolToken(set); + } + + int TranslateOrAdd(char ch) { + var t = m_alphabet.Translate(ch); + if (t == CharAlphabet.UNCLASSIFIED) + t = m_alphabet.DefineSymbol(ch); + return t; + } + + IEnumerable<int> TranslateOrAdd(IEnumerable<char> symbols) { + return symbols.Distinct().Select(TranslateOrAdd); + } + + int TranslateOrDie(char ch) { + var t = m_alphabet.Translate(ch); + if (t == CharAlphabet.UNCLASSIFIED) + throw new ApplicationException(String.Format("Symbol '{0}' is UNCLASSIFIED", ch)); + return t; + } + + IEnumerable<int> TranslateOrDie(IEnumerable<char> symbols) { + return symbols.Distinct().Select(TranslateOrDie); + } + + public Token SymbolTokenExcept(IEnumerable<char> symbols) { + Safe.ArgumentNotNull(symbols, "symbols"); + + return Token.New( Enumerable.Range(0, m_alphabet.Count).Except(TranslateOrDie(symbols)).ToArray()); + } + + protected CDFADefinition BuildDFA(Token lang) { + Safe.ArgumentNotNull(lang, "lang"); + + var dfa = new CDFADefinition(m_alphabet); + + var builder = new DFABuilder(); + + lang.Accept( builder ); + + builder.BuildDFA(dfa); + if (dfa.InitialStateIsFinal) + throw new ApplicationException("The specified language contains empty token"); + + return dfa.Optimize(); + } + + + + //protected abstract TGrammar CreateInstance(); + } + + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/IVisitor.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,13 @@ +namespace Implab.Automaton.RegularExpressions { + /// <summary> + /// Интерфейс обходчика синтаксического дерева регулярного выражения + /// </summary> + public interface IVisitor<TTag> { + void Visit(AltToken<TTag> token); + void Visit(StarToken<TTag> token); + void Visit(CatToken<TTag> token); + void Visit(EmptyToken<TTag> token); + void Visit(EndToken<TTag> token); + void Visit(SymbolToken<TTag> token); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/StarToken.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,34 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton.RegularExpressions { + /// <summary> + /// Замыкание выражения с 0 и более повторов. + /// </summary> + public class StarToken<TTag>: Token<TTag> { + + Token<TTag> m_token; + + public Token<TTag> Token { + get { return m_token; } + } + + public StarToken(Token<TTag> token) { + Safe.ArgumentNotNull(token, "token"); + m_token = token; + } + + public override void Accept(IVisitor<TTag> visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + + public override string ToString() { + return String.Format("({0})*", Token); + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/SymbolToken.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,27 @@ +using Implab; + +namespace Implab.Automaton.RegularExpressions { + /// <summary> + /// Выражение, соответсвующее одному символу. + /// </summary> + public class SymbolToken<TTag> : Token<TTag> { + int m_value; + + public int Value { + get { return m_value; } + } + + public SymbolToken(int value) { + m_value = value; + } + public override void Accept(IVisitor<TTag> visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + + visitor.Visit(this); + } + + public override string ToString() { + return Value.ToString(); + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/RegularExpressions/Token.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,63 @@ +using Implab; +using System; +using System.Linq; + +namespace Implab.Automaton.RegularExpressions { + public abstract class Token<TTag> { + public abstract void Accept(IVisitor<TTag> visitor); + + public Token<TTag> Extend() { + return Cat(new EndToken<TTag>()); + } + + public Token<TTag> Tag<T>(T tag) where T : IConvertible { + return Cat(new EndToken<TTag>(tag)); + } + + public Token<TTag> Cat(Token<TTag> right) { + return new CatToken<TTag>(this, right); + } + + public Token<TTag> Or(Token<TTag> right) { + return new AltToken<TTag>(this, right); + } + + public Token<TTag> Optional() { + return Or(new EmptyToken<TTag>()); + } + + public Token<TTag> EClosure() { + return new StarToken<TTag>(this); + } + + public Token<TTag> Closure() { + return Cat(new StarToken<TTag>(this)); + } + + public Token<TTag> Repeat(int count) { + Token<TTag> token = null; + + for (int i = 0; i < count; i++) + token = token != null ? token.Cat(this) : this; + return token ?? new EmptyToken<TTag>(); + } + + public Token<TTag> Repeat(int min, int max) { + if (min > max || min < 1) + throw new ArgumentOutOfRangeException(); + var token = Repeat(min); + + for (int i = min; i < max; i++) + token = token.Cat( this.Optional() ); + return token; + } + + public static Token<TTag> New<T>(params T[] set) where T : struct, IConvertible { + Safe.ArgumentNotNull(set, "set"); + Token<TTag> token = null; + foreach(var c in set.Distinct()) + token = token == null ? new SymbolToken<TTag>(c) : token.Or(new SymbolToken<TTag>(c)); + return token; + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Implab/Automaton/Scanner.cs Wed Feb 24 08:39:53 2016 +0300 @@ -0,0 +1,259 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.IO; +using Implab.Components; + +namespace Implab.Automaton { + /// <summary> + /// Базовый класс для разбора потока входных символов на токены. + /// </summary> + /// <remarks> + /// Сканнер имеет внутри буффер с симолами входного текста, по которому перемещаются два + /// указателя, начала и конца токена, при перемещении искользуется ДКА для определения + /// конца токена и допустимости текущего символа. + /// </remarks> + public abstract class Scanner<TTag> : Disposable { + struct ScannerConfig { + public DFAStateDescriptior<TTag>[] states; + public int[] alphabetMap; + } + + Stack<ScannerConfig> m_defs = new Stack<ScannerConfig>(); + + DFAStateDescriptior<TTag>[] m_states; + int[] m_alphabetMap; + + protected DFAStateDescriptior<TTag> m_currentState; + int m_previewCode; + + protected int m_tokenLen = 0; + protected int m_tokenOffset; + + protected char[] m_buffer; + protected int m_bufferSize; + protected int m_pointer; + + TextReader m_reader; + bool m_disposeReader; + int m_chunkSize = 1024; // 1k + int m_limit = 10 * 1024 * 1024; // 10Mb + + protected Scanner(DFAStateDescriptior<TTag>[] states, int[] alphabet) { + Safe.ArgumentNotEmpty(states, "states"); + Safe.ArgumentNotNull(alphabet, "alphabet"); + + m_states = states; + m_alphabetMap = alphabet; + + Feed(new char[0]); + } + + /// <summary> + /// Заполняет входными данными буффер. + /// </summary> + /// <param name="data">Данные для обработки.</param> + /// <remarks>Копирование данных не происходит, переданный массив используется в + /// качестве входного буффера.</remarks> + public void Feed(char[] data) { + Safe.ArgumentNotNull(data, "data"); + + Feed(data, data.Length); + } + + /// <summary> + /// Заполняет буффур чтения входными данными. + /// </summary> + /// <param name="data">Данные для обработки.</param> + /// <param name="length">Длина данных для обработки.</param> + /// <remarks>Копирование данных не происходит, переданный массив используется в + /// качестве входного буффера.</remarks> + public void Feed(char[] data, int length) { + Safe.ArgumentNotNull(data, "data"); + Safe.ArgumentInRange(length, 0, data.Length, "length"); + AssertNotDisposed(); + + m_pointer = -1; + m_buffer = data; + m_bufferSize = length; + Shift(); + } + + public void Feed(TextReader reader, bool dispose) { + Safe.ArgumentNotNull(reader, "reader"); + AssertNotDisposed(); + + if (m_reader != null && m_disposeReader) + m_reader.Dispose(); + + m_reader = reader; + m_disposeReader = dispose; + m_pointer = -1; + m_buffer = new char[m_chunkSize]; + m_bufferSize = 0; + Shift(); + } + + /// <summary> + /// Получает текущий токен в виде строки. + /// </summary> + /// <returns></returns> + protected string GetTokenValue() { + return new String(m_buffer, m_tokenOffset, m_tokenLen); + } + + /// <summary> + /// Метки текущего токена, которые были назначены в регулярном выражении. + /// </summary> + protected TTag[] TokenTags { + get { + return m_currentState.tag; + } + } + + /// <summary> + /// Признак конца данных + /// </summary> + public bool EOF { + get { + return m_pointer >= m_bufferSize; + } + } + + /// <summary> + /// Читает следующий токен, при этом <see cref="m_tokenOffset"/> указывает на начало токена, + /// <see cref="m_tokenLen"/> на длину токена, <see cref="m_buffer"/> - массив символов, в + /// котором находится токен. + /// </summary> + /// <returns><c>false</c> - достигнут конец данных, токен не прочитан.</returns> + protected bool ReadTokenInternal() { + if (m_pointer >= m_bufferSize) + return false; + + m_currentState = m_states[DFADefinition.INITIAL_STATE]; + m_tokenLen = 0; + m_tokenOffset = m_pointer; + int nextState; + do { + nextState = m_currentState.transitions[m_previewCode]; + if (nextState == DFAConst.UNREACHABLE_STATE) { + if (m_currentState.final) + return true; + else + throw new ParserException( + String.Format( + "Unexpected symbol '{0}', at pos {1}", + m_buffer[m_pointer], + Position + ) + ); + } else { + m_currentState = m_states[nextState]; + m_tokenLen++; + } + + } while (Shift()); + + // END OF DATA + if (!m_currentState.final) + throw new ParserException("Unexpected end of data"); + + return true; + } + + + bool Shift() { + m_pointer++; + + if (m_pointer >= m_bufferSize) { + if (!ReadNextChunk()) + return false; + } + + m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; + + return true; + } + + bool ReadNextChunk() { + if (m_reader == null) + return false; + + // extend buffer if nesessary + if (m_pointer + m_chunkSize > m_buffer.Length) { + // trim unused buffer head + var size = m_tokenLen + m_chunkSize; + if (size >= m_limit) + throw new ParserException(String.Format("Input buffer {0} bytes limit exceeded", m_limit)); + var temp = new char[size]; + Array.Copy(m_buffer, m_tokenOffset, temp, 0, m_tokenLen); + m_pointer -= m_tokenOffset; + m_bufferSize -= m_tokenOffset; + m_tokenOffset = 0; + m_buffer = temp; + } + + var read = m_reader.Read(m_buffer, m_tokenLen, m_chunkSize); + if (read == 0) + return false; + + m_bufferSize += read; + + return true; + } + + /// <summary> + /// Позиция сканнера во входном буфере + /// </summary> + public int Position { + get { + return m_pointer + 1; + } + } + + /// <summary> + /// Преключает внутренний ДКА на указанный, позволяет реализовать подобие захватывающей + /// группировки. + /// </summary> + /// <param name="states">Таблица состояний нового ДКА</param> + /// <param name="alphabet">Таблица входных символов для нового ДКА</param> + protected void Switch(DFAStateDescriptior<TTag>[] states, int[] alphabet) { + Safe.ArgumentNotNull(states, "dfa"); + + m_defs.Push(new ScannerConfig { + states = m_states, + alphabetMap = m_alphabetMap + }); + + m_states = states; + m_alphabetMap = alphabet; + + m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; + } + + /// <summary> + /// Восстанавливает предыдущей ДКА сканнера. + /// </summary> + protected void Restore() { + if (m_defs.Count == 0) + throw new InvalidOperationException(); + var prev = m_defs.Pop(); + m_states = prev.states; + m_alphabetMap = prev.alphabetMap; + m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; + } + + protected override void Dispose(bool disposing) { + if (disposing) { + if (m_reader != null && m_disposeReader) + m_reader.Dispose(); + m_buffer = null; + m_bufferSize = 0; + m_pointer = 0; + m_tokenLen = 0; + m_tokenOffset = 0; + } + base.Dispose(disposing); + } + } +}
--- a/Implab/Implab.csproj Fri Feb 19 18:07:17 2016 +0300 +++ b/Implab/Implab.csproj Wed Feb 24 08:39:53 2016 +0300 @@ -102,26 +102,6 @@ <Compile Include="Parallels\ArrayTraits.cs" /> <Compile Include="Parallels\MTQueue.cs" /> <Compile Include="Parallels\WorkerPool.cs" /> - <Compile Include="Parsing\AltToken.cs" /> - <Compile Include="Parsing\BinaryToken.cs" /> - <Compile Include="Parsing\CatToken.cs" /> - <Compile Include="Parsing\CDFADefinition.cs" /> - <Compile Include="Parsing\DFABuilder.cs" /> - <Compile Include="Parsing\DFAStateDescriptor.cs" /> - <Compile Include="Parsing\DFAutomaton.cs" /> - <Compile Include="Parsing\EDFADefinition.cs" /> - <Compile Include="Parsing\EmptyToken.cs" /> - <Compile Include="Parsing\EndToken.cs" /> - <Compile Include="Parsing\EnumAlphabet.cs" /> - <Compile Include="Parsing\Grammar.cs" /> - <Compile Include="Parsing\IAlphabet.cs" /> - <Compile Include="Parsing\IDFADefinition.cs" /> - <Compile Include="Parsing\IVisitor.cs" /> - <Compile Include="Parsing\ParserException.cs" /> - <Compile Include="Parsing\Scanner.cs" /> - <Compile Include="Parsing\StarToken.cs" /> - <Compile Include="Parsing\SymbolToken.cs" /> - <Compile Include="Parsing\Token.cs" /> <Compile Include="ProgressInitEventArgs.cs" /> <Compile Include="Properties\AssemblyInfo.cs" /> <Compile Include="Parallels\AsyncPool.cs" /> @@ -178,11 +158,34 @@ <Compile Include="Components\ExecutionState.cs" /> <Compile Include="Components\RunnableComponent.cs" /> <Compile Include="Components\IFactory.cs" /> - <Compile Include="Parsing\DFADefinition.cs" /> - <Compile Include="Parsing\IndexedAlphabetBase.cs" /> - <Compile Include="Parsing\CharAlphabet.cs" /> - <Compile Include="Parsing\IAlphabetBuilder.cs" /> - <Compile Include="Parsing\IDFADefinitionBuilder.cs" /> + <Compile Include="Automaton\CDFADefinition.cs" /> + <Compile Include="Automaton\DFAStateDescriptor.cs" /> + <Compile Include="Automaton\DFAutomaton.cs" /> + <Compile Include="Automaton\EDFADefinition.cs" /> + <Compile Include="Automaton\EnumAlphabet.cs" /> + <Compile Include="Automaton\IAlphabet.cs" /> + <Compile Include="Automaton\IDFADefinition.cs" /> + <Compile Include="Automaton\ParserException.cs" /> + <Compile Include="Automaton\Scanner.cs" /> + <Compile Include="Automaton\DFADefinition.cs" /> + <Compile Include="Automaton\IndexedAlphabetBase.cs" /> + <Compile Include="Automaton\CharAlphabet.cs" /> + <Compile Include="Automaton\IAlphabetBuilder.cs" /> + <Compile Include="Automaton\IDFADefinitionBuilder.cs" /> + <Compile Include="Automaton\RegularExpressions\AltToken.cs" /> + <Compile Include="Automaton\RegularExpressions\BinaryToken.cs" /> + <Compile Include="Automaton\RegularExpressions\CatToken.cs" /> + <Compile Include="Automaton\DFAConst.cs" /> + <Compile Include="Automaton\RegularExpressions\Grammar.cs" /> + <Compile Include="Automaton\RegularExpressions\StarToken.cs" /> + <Compile Include="Automaton\RegularExpressions\SymbolToken.cs" /> + <Compile Include="Automaton\RegularExpressions\EmptyToken.cs" /> + <Compile Include="Automaton\RegularExpressions\EndToken.cs" /> + <Compile Include="Automaton\RegularExpressions\Token.cs" /> + <Compile Include="Automaton\RegularExpressions\IVisitor.cs" /> + <Compile Include="Automaton\RegularExpressions\DFABuilder.cs" /> + <Compile Include="Automaton\AutomatonTransition.cs" /> + <Compile Include="Automaton\ByteAlphabet.cs" /> </ItemGroup> <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" /> <ItemGroup /> @@ -257,5 +260,6 @@ </ProjectExtensions> <ItemGroup> <Folder Include="Components\" /> + <Folder Include="Automaton\RegularExpressions\" /> </ItemGroup> </Project> \ No newline at end of file
--- a/Implab/Parsing/AltToken.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ -using System; - -namespace Implab.Parsing { - public class AltToken: BinaryToken { - public AltToken(Token left, Token right) - : base(left, right) { - } - - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - public override string ToString() { - return String.Format(Right is BinaryToken ? "{0}|({1})" : "{0}|{1}", Left, Right); - } - } -}
--- a/Implab/Parsing/BinaryToken.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public abstract class BinaryToken : Token { - Token m_left; - Token m_right; - - public Token Left { - get { return m_left; } - } - - public Token Right { - get { return m_right; } - } - - protected BinaryToken(Token left, Token right) { - Safe.ArgumentNotNull(m_left = left, "left"); - Safe.ArgumentNotNull(m_right = right, "right"); - } - } -}
--- a/Implab/Parsing/CDFADefinition.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -namespace Implab.Parsing { - public class CDFADefinition : DFADefinition { - readonly CharAlphabet m_alphabet; - - public CharAlphabet Alphabet { - get { return m_alphabet; } - } - - public CDFADefinition(CharAlphabet alphabet): base(alphabet.Count) { - m_alphabet = alphabet; - } - - public CDFADefinition Optimize() { - - return (CDFADefinition)Optimize(alphabet => new CDFADefinition((CharAlphabet)alphabet), m_alphabet, new CharAlphabet()); - } - - public void PrintDFA() { - PrintDFA(m_alphabet); - } - } -}
--- a/Implab/Parsing/CatToken.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -using System; - -namespace Implab.Parsing { - public class CatToken : BinaryToken { - public CatToken(Token left, Token right) - : base(left, right) { - } - - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - - public override string ToString() { - return String.Format("{0}{1}", FormatToken(Left), FormatToken(Right)); - } - - string FormatToken(Token token) { - return String.Format(token is AltToken ? "({0})" : "{0}", token); - } - } -}
--- a/Implab/Parsing/CharAlphabet.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public class CharAlphabet: IndexedAlphabetBase<char> { - - public CharAlphabet() - : base(char.MaxValue + 1) { - } - - public override int GetSymbolIndex(char symbol) { - return symbol; - } - - public override IEnumerable<char> InputSymbols { - get { return Enumerable.Range(char.MinValue, char.MaxValue).Select(x => (char)x); } - } - } -}
--- a/Implab/Parsing/DFABuilder.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,180 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; - -namespace Implab.Parsing { - /// <summary> - /// Используется для построения ДКА по регулярному выражению, сначала обходит - /// регулярное выражение и вычисляет followpos, затем используется метод - /// <see cref="BuildDFA(IDFADefinition)"/> для построения автомата. - /// </summary> - public class DFABuilder : IVisitor { - int m_idx = 0; - Token m_root; - HashSet<int> m_firstpos; - HashSet<int> m_lastpos; - - Dictionary<int, HashSet<int>> m_followpos = new Dictionary<int, HashSet<int>>(); - Dictionary<int, int> m_indexes = new Dictionary<int, int>(); - Dictionary<int, int> m_ends = new Dictionary<int, int>(); - - public Dictionary<int, HashSet<int>> FollowposMap { - get { return m_followpos; } - } - - public HashSet<int> Followpos(int pos) { - HashSet<int> set; - if (m_followpos.TryGetValue(pos, out set)) - return set; - return m_followpos[pos] = new HashSet<int>(); - } - - bool Nullable(object n) { - if (n is EmptyToken || n is StarToken) - return true; - if (n is AltToken) - return Nullable(((AltToken)n).Left) || Nullable(((AltToken)n).Right); - if (n is CatToken) - return Nullable(((CatToken)n).Left) && Nullable(((CatToken)n).Right); - return false; - } - - - public void Visit(AltToken token) { - if (m_root == null) - m_root = token; - var firtspos = new HashSet<int>(); - var lastpos = new HashSet<int>(); - - token.Left.Accept(this); - firtspos.UnionWith(m_firstpos); - lastpos.UnionWith(m_lastpos); - - token.Right.Accept(this); - firtspos.UnionWith(m_firstpos); - lastpos.UnionWith(m_lastpos); - - m_firstpos = firtspos; - m_lastpos = lastpos; - } - - public void Visit(StarToken token) { - if (m_root == null) - m_root = token; - token.Token.Accept(this); - - foreach (var i in m_lastpos) - Followpos(i).UnionWith(m_firstpos); - } - - public void Visit(CatToken token) { - if (m_root == null) - m_root = token; - - var firtspos = new HashSet<int>(); - var lastpos = new HashSet<int>(); - token.Left.Accept(this); - firtspos.UnionWith(m_firstpos); - var leftLastpos = m_lastpos; - - token.Right.Accept(this); - lastpos.UnionWith(m_lastpos); - var rightFirstpos = m_firstpos; - - if (Nullable(token.Left)) - firtspos.UnionWith(rightFirstpos); - - if (Nullable(token.Right)) - lastpos.UnionWith(leftLastpos); - - m_firstpos = firtspos; - m_lastpos = lastpos; - - foreach (var i in leftLastpos) - Followpos(i).UnionWith(rightFirstpos); - - } - - public void Visit(EmptyToken token) { - if (m_root == null) - m_root = token; - ; - } - - public void Visit(SymbolToken token) { - if (m_root == null) - m_root = token; - m_idx++; - m_indexes[m_idx] = token.Value; - m_firstpos = new HashSet<int>(new[] { m_idx }); - m_lastpos = new HashSet<int>(new[] { m_idx }); - } - - public void Visit(EndToken token) { - if (m_root == null) - m_root = token; - m_idx++; - m_indexes[m_idx] = IndexedAlphabetBase<char>.UNCLASSIFIED; - m_firstpos = new HashSet<int>(new[] { m_idx }); - m_lastpos = new HashSet<int>(new[] { m_idx }); - Followpos(m_idx); - m_ends.Add(m_idx, token.Tag); - } - - public void BuildDFA(IDFADefinition dfa) { - Safe.ArgumentNotNull(dfa,"dfa"); - - var stateMap = new Dictionary<HashSet<int>, int>(new CustomEqualityComparer<HashSet<int>>( - (x, y) => x.SetEquals(y), - (x) => x.Sum(n => n.GetHashCode()) - )); - - stateMap[m_firstpos] = DefineState( dfa, m_firstpos); - Debug.Assert(stateMap[m_firstpos] == DFADefinition.INITIAL_STATE); - - var queue = new Queue<HashSet<int>>(); - - queue.Enqueue(m_firstpos); - - while (queue.Count > 0) { - var state = queue.Dequeue(); - var s1 = stateMap[state]; - - for (int a = 0; a < dfa.AlphabetSize; a++) { - var next = new HashSet<int>(); - foreach (var p in state) { - if (m_indexes[p] == a) { - next.UnionWith(Followpos(p)); - } - } - if (next.Count > 0) { - int s2; - if (!stateMap.TryGetValue(next, out s2)) { - stateMap[next] = s2 = DefineState(dfa, next); - queue.Enqueue(next); - } - dfa.DefineTransition(s1, s2, a); - } - } - - } - } - - int[] GetStateTags(HashSet<int> state) { - Debug.Assert(state != null); - return state.Where(m_ends.ContainsKey).Select(pos => m_ends[pos]).ToArray(); - } - - int DefineState(IDFADefinition automa, HashSet<int> state) { - Debug.Assert(automa != null); - Debug.Assert(state != null); - - var tags = GetStateTags(state); - - return tags.Length > 0 ? automa.AddState(tags) : automa.AddState(); - } - - } -}
--- a/Implab/Parsing/DFADefinition.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,285 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; - -namespace Implab.Parsing { - public class DFADefinition<TInput, TState, TTag> : IDFADefinition<TInput, TState, TTag> { - readonly List<DFAStateDescriptior<TTag>> m_states; - - public const int INITIAL_STATE = 1; - public const int UNREACHEBLE_STATE = 0; - - DFAStateDescriptior<TTag>[] m_statesArray; - readonly int m_alpabetSize; - - public DFADefinition(int alphabetSize) { - m_states = new List<DFAStateDescriptior<TTag>>(); - m_alpabetSize = alphabetSize; - - m_states.Add(new DFAStateDescriptior<TTag>()); - } - - public bool InitialStateIsFinal { - get { - return m_states[INITIAL_STATE].final; - } - } - - public int AddState() { - var index = m_states.Count; - m_states.Add(new DFAStateDescriptior<TTag> { - final = false, - transitions = new int[AlphabetSize] - }); - m_statesArray = null; - - return index; - } - - public int AddState(TTag[] tag) { - var index = m_states.Count; - bool final = tag != null && tag.Length != 0; - m_states.Add(new DFAStateDescriptior<TTag> { - final = final, - transitions = new int[AlphabetSize], - tag = final ? tag : null - }); - m_statesArray = null; - return index; - } - - public void DefineTransition(TState s1, TState s2, TInput symbol) { - int is1 = StateAlphabet.Translate(s1); - int is2 = StateAlphabet.Translate(s2); - int isym = InputAlphabet.Translate(symbol); - - Safe.ArgumentAssert(is1 != 0, "s1"); - Safe.ArgumentAssert(is2 != 0, "s2"); - Safe.ArgumentAssert(isym != 0, "symbol"); - - m_states[is1].transitions[isym] = is2; - } - - #region IDFADefinition implementation - - public DFAStateDescriptior<TTag>[] GetTransitionTable() { - if (m_statesArray == null) - m_statesArray = m_states.ToArray(); - return m_statesArray; - } - - public IAlphabet<TInput> InputAlphabet { - get { - throw new NotImplementedException(); - } - } - - public IAlphabet<TState> StateAlphabet { - get { - throw new NotImplementedException(); - } - } - - #endregion - - protected IDFADefinition<> Optimize<TA>(Func<IAlphabet<TA>, IDFADefinition> dfaFactory,IAlphabet<TA> sourceAlphabet, IAlphabet<TA> minimalAlphabet) { - Safe.ArgumentNotNull(dfaFactory, "dfaFactory"); - Safe.ArgumentNotNull(minimalAlphabet, "minimalAlphabet"); - - var setComparer = new CustomEqualityComparer<HashSet<int>>( - (x, y) => x.SetEquals(y), - (s) => s.Sum(x => x.GetHashCode()) - ); - - var arrayComparer = new CustomEqualityComparer<int[]>( - (x,y) => (new HashSet<int>(x)).SetEquals(new HashSet<int>(y)), - (a) => a.Sum(x => x.GetHashCode()) - ); - - var optimalStates = new HashSet<HashSet<int>>(setComparer); - var queue = new HashSet<HashSet<int>>(setComparer); - - foreach (var g in Enumerable - .Range(INITIAL_STATE, m_states.Count-1) - .Select(i => new { - index = i, - descriptor = m_states[i] - }) - .Where(x => x.descriptor.final) - .GroupBy(x => x.descriptor.tag, arrayComparer) - ) { - optimalStates.Add(new HashSet<int>(g.Select(x => x.index))); - } - - var state = new HashSet<int>( - Enumerable - .Range(INITIAL_STATE, m_states.Count - 1) - .Where(i => !m_states[i].final) - ); - optimalStates.Add(state); - queue.Add(state); - - while (queue.Count > 0) { - var stateA = queue.First(); - queue.Remove(stateA); - - for (int c = 0; c < AlphabetSize; c++) { - var stateX = new HashSet<int>(); - - for(int s = 1; s < m_states.Count; s++) { - if (stateA.Contains(m_states[s].transitions[c])) - stateX.Add(s); - } - - foreach (var stateY in optimalStates.ToArray()) { - if (stateX.Overlaps(stateY) && !stateY.IsSubsetOf(stateX)) { - var stateR1 = new HashSet<int>(stateY); - var stateR2 = new HashSet<int>(stateY); - - stateR1.IntersectWith(stateX); - stateR2.ExceptWith(stateX); - - optimalStates.Remove(stateY); - optimalStates.Add(stateR1); - optimalStates.Add(stateR2); - - if (queue.Contains(stateY)) { - queue.Remove(stateY); - queue.Add(stateR1); - queue.Add(stateR2); - } else { - queue.Add(stateR1.Count <= stateR2.Count ? stateR1 : stateR2); - } - } - } - } - } - - // строим карты соотвествия оптимальных состояний с оригинальными - - var initialState = optimalStates.Single(x => x.Contains(INITIAL_STATE)); - - // карта получения оптимального состояния по соотвествующему ему простому состоянию - int[] reveseOptimalMap = new int[m_states.Count]; - // карта с индексами оптимальных состояний - HashSet<int>[] optimalMap = new HashSet<int>[optimalStates.Count + 1]; - { - optimalMap[0] = new HashSet<int>(); // unreachable state - optimalMap[1] = initialState; // initial state - foreach (var ss in initialState) - reveseOptimalMap[ss] = 1; - - int i = 2; - foreach (var s in optimalStates) { - if (s.SetEquals(initialState)) - continue; - optimalMap[i] = s; - foreach (var ss in s) - reveseOptimalMap[ss] = i; - i++; - } - } - - // получаем минимальный алфавит - - var minClasses = new HashSet<HashSet<int>>(setComparer); - var alphaQueue = new Queue<HashSet<int>>(); - alphaQueue.Enqueue(new HashSet<int>(Enumerable.Range(0,AlphabetSize))); - - for (int s = 1 ; s < optimalMap.Length; s++) { - var newQueue = new Queue<HashSet<int>>(); - - foreach (var A in alphaQueue) { - if (A.Count == 1) { - minClasses.Add(A); - continue; - } - - // различаем классы символов, которые переводят в различные оптимальные состояния - // optimalState -> alphaClass - var classes = new Dictionary<int, HashSet<int>>(); - - foreach (var term in A) { - // ищем все переходы класса по символу term - var s2 = reveseOptimalMap[ - optimalMap[s].Select(x => m_states[x].transitions[term]).FirstOrDefault(x => x != 0) // первое допустимое элементарное состояние, если есть - ]; - - HashSet<int> A2; - if (!classes.TryGetValue(s2, out A2)) { - A2 = new HashSet<int>(); - newQueue.Enqueue(A2); - classes[s2] = A2; - } - A2.Add(term); - } - } - - if (newQueue.Count == 0) - break; - alphaQueue = newQueue; - } - - foreach (var A in alphaQueue) - minClasses.Add(A); - - var alphabetMap = sourceAlphabet.Reclassify(minimalAlphabet, minClasses); - - // построение автомата - - var minimalDFA = dfaFactory(minimalAlphabet); - - var states = new int[ optimalMap.Length ]; - states[0] = UNREACHEBLE_STATE; - - for(var s = INITIAL_STATE; s < states.Length; s++) { - var tags = optimalMap[s].SelectMany(x => m_states[x].tag ?? Enumerable.Empty<int>()).Distinct().ToArray(); - if (tags.Length > 0) - states[s] = minimalDFA.AddState(tags); - else - states[s] = minimalDFA.AddState(); - } - - Debug.Assert(states[INITIAL_STATE] == INITIAL_STATE); - - for (int s1 = 1; s1 < m_states.Count; s1++) { - for (int c = 0; c < AlphabetSize; c++) { - var s2 = m_states[s1].transitions[c]; - if (s2 != UNREACHEBLE_STATE) { - minimalDFA.DefineTransition( - reveseOptimalMap[s1], - reveseOptimalMap[s2], - alphabetMap[c] - ); - } - } - } - - return minimalDFA; - } - - public void PrintDFA<TA>(IAlphabet<TA> alphabet) { - - var reverseMap = alphabet.CreateReverseMap(); - - for (int i = 1; i < reverseMap.Length; i++) { - Console.WriteLine("C{0}: {1}", i, String.Join(",", reverseMap[i])); - } - - for (int i = 1; i < m_states.Count; i++) { - var s = m_states[i]; - for (int c = 0; c < AlphabetSize; c++) - if (s.transitions[c] != UNREACHEBLE_STATE) - Console.WriteLine("S{0} -{1}-> S{2}{3}", i, String.Join(",", reverseMap[c]), s.transitions[c], m_states[s.transitions[c]].final ? "$" : ""); - } - } - - public int AlphabetSize { - get { - return m_alpabetSize; - } - } - } -}
--- a/Implab/Parsing/DFAStateDescriptor.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public struct DFAStateDescriptior<TTag> { - public bool final; - public TTag[] tag; - public int[] transitions; - } -}
--- a/Implab/Parsing/DFAutomaton.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public abstract class DFAutomaton<T> { - protected struct ContextFrame { - public DFAStateDescriptior[] states; - public int current; - public T info; - } - - public const int INITIAL_STATE = DFADefinition.INITIAL_STATE; - public const int UNREACHEBLE_STATE = DFADefinition.UNREACHEBLE_STATE; - - protected ContextFrame m_context; - Stack<ContextFrame> m_contextStack = new Stack<ContextFrame>(); - - protected int Level { - get { return m_contextStack.Count; } - } - - protected DFAutomaton(DFAStateDescriptior[] states, int startState, T info) { - Safe.ArgumentNotNull(states, "states"); - Safe.ArgumentInRange(startState, 0, states.Length - 1, "startState"); - - m_context.states = states; - m_context.current = startState; - m_context.info = info; - } - - protected void Switch(DFAStateDescriptior[] states, int current, T info) { - Debug.Assert(states != null); - Debug.Assert(current >= 0 && current < states.Length); - m_contextStack.Push(m_context); - m_context.states = states; - m_context.current = current; - m_context.info = info; - } - - protected void Restore() { - Debug.Assert(m_contextStack.Count > 0); - - m_context = m_contextStack.Pop(); - } - - protected void Move(int input) { - Debug.Assert(input > 0 && input < m_context.states[m_context.current].transitions.Length); - m_context.current = m_context.states[m_context.current].transitions[input]; - } - - protected bool CanMove(int input) { - Debug.Assert(input > 0 && input < m_context.states[m_context.current].transitions.Length); - return m_context.states[m_context.current].transitions[input] != UNREACHEBLE_STATE; - } - } -}
--- a/Implab/Parsing/EDFADefinition.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -using Implab; -using System; - -namespace Implab.Parsing { - public class EDFADefinition<T> : DFADefinition where T : struct, IConvertible { - readonly EnumAlphabet<T> m_alphabet; - - public EnumAlphabet<T> Alphabet { - get { return m_alphabet; } - } - - public EDFADefinition(EnumAlphabet<T> alphabet) : base(alphabet.Count) { - m_alphabet = alphabet; - } - - public void DefineTransition(int s1, int s2, T input) { - DefineTransition(s1, s2, m_alphabet.Translate(input)); - } - - public EDFADefinition<T> Optimize() { - - return (EDFADefinition<T>)Optimize(alphabet => new EDFADefinition<T>((EnumAlphabet<T>)alphabet), m_alphabet, new EnumAlphabet<T>()); - } - - public void PrintDFA() { - PrintDFA(m_alphabet); - } - } -}
--- a/Implab/Parsing/EmptyToken.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public class EmptyToken : Token { - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - public override string ToString() { - return "$"; - } - } -}
--- a/Implab/Parsing/EndToken.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// <summary> - /// Конечный символ расширенного регулярного выражения, при построении ДКА - /// используется для определения конечных состояний. - /// </summary> - public class EndToken: Token { - - int m_tag; - - public EndToken(int tag) { - m_tag = tag; - } - - public EndToken() - : this(0) { - } - - public int Tag { - get { return m_tag; } - } - - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - public override string ToString() { - return "#"; - } - } -}
--- a/Implab/Parsing/EnumAlphabet.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Globalization; -using System.Linq; -using System.Diagnostics.CodeAnalysis; - -namespace Implab.Parsing { - /// <summary> - /// Алфавит символами которого являются элементы перечислений. - /// </summary> - /// <typeparam name="T">Тип перечислений</typeparam> - public class EnumAlphabet<T> : IndexedAlphabetBase<T> where T : struct, IConvertible { - [SuppressMessage("Microsoft.Design", "CA1000:DoNotDeclareStaticMembersOnGenericTypes")] - static readonly T[] _symbols; - static readonly EnumAlphabet<T> _fullAlphabet; - - [SuppressMessage("Microsoft.Design", "CA1065:DoNotRaiseExceptionsInUnexpectedLocations")] - static EnumAlphabet() { - if (!typeof(T).IsEnum) - throw new InvalidOperationException("Invalid generic parameter, enumeration is required"); - - if (Enum.GetUnderlyingType(typeof(T)) != typeof(Int32)) - throw new InvalidOperationException("Only enums based on Int32 are supported"); - - _symbols = ((T[])Enum.GetValues(typeof(T))) - .OrderBy(x => x.ToInt32(CultureInfo.InvariantCulture)) - .ToArray(); - - if ( - _symbols[_symbols.Length - 1].ToInt32(CultureInfo.InvariantCulture) >= _symbols.Length - || _symbols[0].ToInt32(CultureInfo.InvariantCulture) != 0 - ) - throw new InvalidOperationException("The specified enumeration must be zero-based and continuously numbered"); - - _fullAlphabet = new EnumAlphabet<T>(_symbols.Select(x => x.ToInt32(CultureInfo.InvariantCulture)).ToArray()); - } - - - - public static EnumAlphabet<T> FullAlphabet { - get { - return _fullAlphabet; - } - } - - - public EnumAlphabet() - : base(_symbols.Length) { - } - - public EnumAlphabet(int[] map) - : base(map) { - Debug.Assert(map.Length == _symbols.Length); - } - - - public override int GetSymbolIndex(T symbol) { - return symbol.ToInt32(CultureInfo.InvariantCulture); - } - - public override IEnumerable<T> InputSymbols { - get { return _symbols; } - } - - } -}
--- a/Implab/Parsing/Grammar.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// <summary> - /// Базовый абстрактный класс. Грамматика, позволяет формулировать выражения над алфавитом типа <c>char</c>. - /// </summary> - /// <typeparam name="TGrammar"></typeparam> - public abstract class Grammar<TGrammar> where TGrammar: Grammar<TGrammar>, new() { - static TGrammar _instance; - - public static TGrammar Instance{ - get { - if (_instance == null) - _instance = new TGrammar(); - return _instance; - } - } - - readonly CharAlphabet m_alphabet = new CharAlphabet(); - - public CharAlphabet Alphabet { - get { return m_alphabet; } - } - - public SymbolToken UnclassifiedToken() { - return new SymbolToken(CharAlphabet.UNCLASSIFIED); - } - - public void DefineAlphabet(IEnumerable<char> alphabet) { - Safe.ArgumentNotNull(alphabet, "alphabet"); - - foreach (var ch in alphabet) - m_alphabet.DefineSymbol(ch); - } - public Token SymbolRangeToken(char start, char end) { - return SymbolToken(Enumerable.Range(start, end - start + 1).Select(x => (char)x)); - } - - public Token SymbolToken(char symbol) { - return Token.New(TranslateOrAdd(symbol)); - } - - public Token SymbolToken(IEnumerable<char> symbols) { - Safe.ArgumentNotNull(symbols, "symbols"); - - return Token.New(TranslateOrAdd(symbols).ToArray()); - } - - public Token SymbolSetToken(params char[] set) { - return SymbolToken(set); - } - - int TranslateOrAdd(char ch) { - var t = m_alphabet.Translate(ch); - if (t == CharAlphabet.UNCLASSIFIED) - t = m_alphabet.DefineSymbol(ch); - return t; - } - - IEnumerable<int> TranslateOrAdd(IEnumerable<char> symbols) { - return symbols.Distinct().Select(TranslateOrAdd); - } - - int TranslateOrDie(char ch) { - var t = m_alphabet.Translate(ch); - if (t == CharAlphabet.UNCLASSIFIED) - throw new ApplicationException(String.Format("Symbol '{0}' is UNCLASSIFIED", ch)); - return t; - } - - IEnumerable<int> TranslateOrDie(IEnumerable<char> symbols) { - return symbols.Distinct().Select(TranslateOrDie); - } - - public Token SymbolTokenExcept(IEnumerable<char> symbols) { - Safe.ArgumentNotNull(symbols, "symbols"); - - return Token.New( Enumerable.Range(0, m_alphabet.Count).Except(TranslateOrDie(symbols)).ToArray()); - } - - protected CDFADefinition BuildDFA(Token lang) { - Safe.ArgumentNotNull(lang, "lang"); - - var dfa = new CDFADefinition(m_alphabet); - - var builder = new DFABuilder(); - - lang.Accept( builder ); - - builder.BuildDFA(dfa); - if (dfa.InitialStateIsFinal) - throw new ApplicationException("The specified language contains empty token"); - - return dfa.Optimize(); - } - - - - //protected abstract TGrammar CreateInstance(); - } - - -}
--- a/Implab/Parsing/IAlphabet.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// <summary> - /// Алфавит. Множество символов, которые разбиты на классы, при этом классы имеют непрерывную нумерацию, - /// что позволяет использовать их в качестве индексов массивов. - /// </summary> - /// <remarks> - /// <para>Алфавит является сюрьективным отображением множества символов в множество индексов, это позволяет сократить размер таблицы переходов автомата - /// для входных символов, которые для него не различимы.</para> - /// </remarks> - /// <typeparam name="TSymbol">Тип символов.</typeparam> - public interface IAlphabet<TSymbol> { - /// <summary> - /// Количество классов символов в алфавите. - /// </summary> - int Count { get; } - - /// <summary> - /// Создает карту обратного сопоставления класса символов алфавита и сопоставленным - /// ему исходным символам. - /// </summary> - /// <returns></returns> - List<TSymbol>[] CreateReverseMap(); - - /// <summary> - /// Создает новый алфавит на основе текущего, горппируя его сиволы в более - /// крупные непересекающиеся классы символов. - /// </summary> - /// <param name="newAlphabet">Новый, пустой алфавит, в котором быдут определены классы.</param> - /// <param name="classes">Множество классов символов текущего алфавита.</param> - /// <returns>Карта для перехода классов текущего - /// алфавита к классам нового.</returns> - /// <remarks>Ползволяет укрупнить алфавит, объединив классы в текущем алфавите. Используется при оптимизации автомата.</remarks> - int[] Reclassify(IAlphabetBuilder<TSymbol> newAlphabet, IEnumerable<ICollection<int>> classes); - - /// <summary> - /// Преобразует входной символ в индекс символа из алфавита. - /// </summary> - /// <param name="symobl">Исходный символ</param> - /// <returns>Индекс в алфавите</returns> - int Translate(TSymbol symobl); - } -}
--- a/Implab/Parsing/IAlphabetBuilder.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -using System; -using System.Collections.Generic; - -namespace Implab.Parsing { - public interface IAlphabetBuilder<TSymbol> : IAlphabet<TSymbol> { - /// <summary> - /// Добавляет новый символ в алфавит, если символ уже был добавлен, то - /// возвращается ранее сопоставленный с символом класс. - /// </summary> - /// <param name="symbol">Символ для добавления.</param> - /// <returns>Индекс класса, который попоставлен с символом.</returns> - int DefineSymbol(TSymbol symbol); - /// <summary> - /// Доабвляем класс символов. Множеству указанных исходных символов - /// будет сопоставлен символ в алфавите. - /// </summary> - /// <param name="symbols">Множестов исходных символов</param> - /// <returns>Идентификатор символа алфавита.</returns> - int DefineClass(IEnumerable<TSymbol> symbols); - - - - } -} -
--- a/Implab/Parsing/IDFADefinition.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// <summary> - /// Полностью описывает DFA автомат, его поведение, состояние и входные символы. - /// </summary> - /// <example> - /// class MyAutomaton { - /// int m_current; - /// readonly DFAStateDescriptor<string>[] m_automaton; - /// readonly IAlphabet<MyCommands> m_commands; - /// - /// public MyAutomaton(IDFADefinition<MyCommands,MyStates,string> definition) { - /// m_current = definition.StateAlphabet.Translate(MyStates.Initial); - /// m_automaton = definition.GetTransitionTable(); - /// m_commands = definition.InputAlphabet; - /// } - /// - /// // defined a method which will move the automaton to the next state - /// public void Move(MyCommands cmd) { - /// // use transition map to determine the next state - /// var next = m_automaton[m_current].transitions[m_commands.Translate(cmd)]; - /// - /// // validate that we aren't in the unreachable state - /// if (next == DFAConst.UNREACHABLE_STATE) - /// throw new InvalidOperationException("The specified command is invalid"); - /// - /// // if everything is ok - /// m_current = next; - /// } - /// } - /// </example> - public interface IDFADefinition<TInput, TState, TTag> { - /// <summary> - /// Алфавит входных символов - /// </summary> - /// <value>The input alphabet.</value> - IAlphabet<TInput> InputAlphabet { - get; - } - - /// <summary> - /// Алфавит состояний автомата - /// </summary> - /// <value>The state alphabet.</value> - IAlphabet<TState> StateAlphabet { - get; - } - - /// <summary> - /// Таблица переходов состояний автомата - /// </summary> - /// <returns>The transition table.</returns> - DFAStateDescriptior<TTag>[] GetTransitionTable(); - - } -}
--- a/Implab/Parsing/IDFADefinitionBuilder.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -using System; - -namespace Implab.Parsing { - public interface IDFADefinitionBuilder<TInput, TState, TTag> : IDFADefinition<TInput, TState, TTag> { - - - } -} -
--- a/Implab/Parsing/IVisitor.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// <summary> - /// Интерфейс обходчика синтаксического дерева регулярного выражения - /// </summary> - public interface IVisitor { - void Visit(AltToken token); - void Visit(StarToken token); - void Visit(CatToken token); - void Visit(EmptyToken token); - void Visit(EndToken token); - void Visit(SymbolToken token); - } -}
--- a/Implab/Parsing/IndexedAlphabetBase.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// <summary> - /// Indexed alphabet is the finite set of symbols where each symbol has a zero-based unique index. - /// </summary> - public abstract class IndexedAlphabetBase<T> : IAlphabet<T> { - public const int UNCLASSIFIED = 0; - - int m_nextId = 1; - readonly int[] m_map; - - public int Count { - get { return m_nextId; } - } - - protected IndexedAlphabetBase(int mapSize) { - m_map = new int[mapSize]; - } - - protected IndexedAlphabetBase(int[] map) { - Debug.Assert(map != null); - - m_map = map; - m_nextId = map.Max() + 1; - } - - public int DefineSymbol(T symbol) { - var index = GetSymbolIndex(symbol); - if (m_map[index] == UNCLASSIFIED) - m_map[index] = m_nextId++; - return m_map[index]; - } - - public int DefineClass(IEnumerable<T> symbols) { - Safe.ArgumentNotNull(symbols, "symbols"); - symbols = symbols.Distinct(); - - foreach (var symbol in symbols) { - var index = GetSymbolIndex(symbol); - if (m_map[index] == UNCLASSIFIED) - m_map[GetSymbolIndex(symbol)] = m_nextId; - else - throw new InvalidOperationException(String.Format("Symbol '{0}' already in use", symbol)); - } - return m_nextId++; - } - - public List<T>[] CreateReverseMap() { - return - Enumerable.Range(UNCLASSIFIED, Count) - .Select( - i => InputSymbols - .Where(x => i != UNCLASSIFIED && m_map[GetSymbolIndex(x)] == i) - .ToList() - ) - .ToArray(); - } - - public int[] Reclassify(IAlphabet<T> newAlphabet, IEnumerable<ICollection<int>> classes) { - Safe.ArgumentNotNull(newAlphabet, "newAlphabet"); - Safe.ArgumentNotNull(classes, "classes"); - var reverseMap = CreateReverseMap(); - - int[] translationMap = new int[Count]; - - foreach (var scl in classes) { - // skip if the supper class contains the unclassified element - if (scl.Contains(UNCLASSIFIED)) - continue; - var range = new List<T>(); - foreach (var cl in scl) { - if (cl < 0 || cl >= reverseMap.Length) - throw new ArgumentOutOfRangeException(String.Format("Class {0} is not valid for the current alphabet", cl)); - range.AddRange(reverseMap[cl]); - } - var newClass = newAlphabet.DefineClass(range); - foreach (var cl in scl) - translationMap[cl] = newClass; - } - - return translationMap; - } - - public virtual int Translate(T symbol) { - return m_map[GetSymbolIndex(symbol)]; - } - - public abstract int GetSymbolIndex(T symbol); - - public abstract IEnumerable<T> InputSymbols { get; } - - /// <summary> - /// Gets the translation map from the index of the symbol to it's class this is usefull for the optimized input symbols transtaion. - /// </summary> - /// <returns>The translation map.</returns> - public int[] GetTranslationMap() { - return m_map; - } - } -}
--- a/Implab/Parsing/ParserException.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace Implab.Parsing { - [Serializable] - public class ParserException : Exception { - public ParserException() { } - public ParserException(string message) : base(message) { } - public ParserException(string message, Exception inner) : base(message, inner) { } - protected ParserException( - System.Runtime.Serialization.SerializationInfo info, - System.Runtime.Serialization.StreamingContext context) - : base(info, context) { } - } -}
--- a/Implab/Parsing/Scanner.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,259 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.IO; -using Implab.Components; - -namespace Implab.Parsing { - /// <summary> - /// Базовый класс для разбора потока входных символов на токены. - /// </summary> - /// <remarks> - /// Сканнер имеет внутри буффер с симолами входного текста, по которому перемещаются два - /// указателя, начала и конца токена, при перемещении искользуется ДКА для определения - /// конца токена и допустимости текущего символа. - /// </remarks> - public abstract class Scanner : Disposable { - struct ScannerConfig { - public DFAStateDescriptior[] states; - public int[] alphabetMap; - } - - Stack<ScannerConfig> m_defs = new Stack<ScannerConfig>(); - - DFAStateDescriptior[] m_states; - int[] m_alphabetMap; - - protected DFAStateDescriptior m_currentState; - int m_previewCode; - - protected int m_tokenLen = 0; - protected int m_tokenOffset; - - protected char[] m_buffer; - protected int m_bufferSize; - protected int m_pointer; - - TextReader m_reader; - bool m_disposeReader; - int m_chunkSize = 1024; // 1k - int m_limit = 10 * 1024 * 1024; // 10Mb - - protected Scanner(DFAStateDescriptior[] states, int[] alphabet) { - Safe.ArgumentNotEmpty(states, "states"); - Safe.ArgumentNotNull(alphabet, "alphabet"); - - m_states = states; - m_alphabetMap = alphabet; - - Feed(new char[0]); - } - - /// <summary> - /// Заполняет входными данными буффер. - /// </summary> - /// <param name="data">Данные для обработки.</param> - /// <remarks>Копирование данных не происходит, переданный массив используется в - /// качестве входного буффера.</remarks> - public void Feed(char[] data) { - Safe.ArgumentNotNull(data, "data"); - - Feed(data, data.Length); - } - - /// <summary> - /// Заполняет буффур чтения входными данными. - /// </summary> - /// <param name="data">Данные для обработки.</param> - /// <param name="length">Длина данных для обработки.</param> - /// <remarks>Копирование данных не происходит, переданный массив используется в - /// качестве входного буффера.</remarks> - public void Feed(char[] data, int length) { - Safe.ArgumentNotNull(data, "data"); - Safe.ArgumentInRange(length, 0, data.Length, "length"); - AssertNotDisposed(); - - m_pointer = -1; - m_buffer = data; - m_bufferSize = length; - Shift(); - } - - public void Feed(TextReader reader, bool dispose) { - Safe.ArgumentNotNull(reader, "reader"); - AssertNotDisposed(); - - if (m_reader != null && m_disposeReader) - m_reader.Dispose(); - - m_reader = reader; - m_disposeReader = dispose; - m_pointer = -1; - m_buffer = new char[m_chunkSize]; - m_bufferSize = 0; - Shift(); - } - - /// <summary> - /// Получает текущий токен в виде строки. - /// </summary> - /// <returns></returns> - protected string GetTokenValue() { - return new String(m_buffer, m_tokenOffset, m_tokenLen); - } - - /// <summary> - /// Метки текущего токена, которые были назначены в регулярном выражении. - /// </summary> - protected int[] TokenTags { - get { - return m_currentState.tag; - } - } - - /// <summary> - /// Признак конца данных - /// </summary> - public bool EOF { - get { - return m_pointer >= m_bufferSize; - } - } - - /// <summary> - /// Читает следующий токен, при этом <see cref="m_tokenOffset"/> указывает на начало токена, - /// <see cref="m_tokenLen"/> на длину токена, <see cref="m_buffer"/> - массив символов, в - /// котором находится токен. - /// </summary> - /// <returns><c>false</c> - достигнут конец данных, токен не прочитан.</returns> - protected bool ReadTokenInternal() { - if (m_pointer >= m_bufferSize) - return false; - - m_currentState = m_states[DFADefinition.INITIAL_STATE]; - m_tokenLen = 0; - m_tokenOffset = m_pointer; - int nextState; - do { - nextState = m_currentState.transitions[m_previewCode]; - if (nextState == DFADefinition.UNREACHEBLE_STATE) { - if (m_currentState.final) - return true; - else - throw new ParserException( - String.Format( - "Unexpected symbol '{0}', at pos {1}", - m_buffer[m_pointer], - Position - ) - ); - } else { - m_currentState = m_states[nextState]; - m_tokenLen++; - } - - } while (Shift()); - - // END OF DATA - if (!m_currentState.final) - throw new ParserException("Unexpected end of data"); - - return true; - } - - - bool Shift() { - m_pointer++; - - if (m_pointer >= m_bufferSize) { - if (!ReadNextChunk()) - return false; - } - - m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; - - return true; - } - - bool ReadNextChunk() { - if (m_reader == null) - return false; - - // extend buffer if nesessary - if (m_pointer + m_chunkSize > m_buffer.Length) { - // trim unused buffer head - var size = m_tokenLen + m_chunkSize; - if (size >= m_limit) - throw new ParserException(String.Format("Input buffer {0} bytes limit exceeded", m_limit)); - var temp = new char[size]; - Array.Copy(m_buffer, m_tokenOffset, temp, 0, m_tokenLen); - m_pointer -= m_tokenOffset; - m_bufferSize -= m_tokenOffset; - m_tokenOffset = 0; - m_buffer = temp; - } - - var read = m_reader.Read(m_buffer, m_tokenLen, m_chunkSize); - if (read == 0) - return false; - - m_bufferSize += read; - - return true; - } - - /// <summary> - /// Позиция сканнера во входном буфере - /// </summary> - public int Position { - get { - return m_pointer + 1; - } - } - - /// <summary> - /// Преключает внутренний ДКА на указанный, позволяет реализовать подобие захватывающей - /// группировки. - /// </summary> - /// <param name="states">Таблица состояний нового ДКА</param> - /// <param name="alphabet">Таблица входных символов для нового ДКА</param> - protected void Switch(DFAStateDescriptior[] states, int[] alphabet) { - Safe.ArgumentNotNull(states, "dfa"); - - m_defs.Push(new ScannerConfig { - states = m_states, - alphabetMap = m_alphabetMap - }); - - m_states = states; - m_alphabetMap = alphabet; - - m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; - } - - /// <summary> - /// Восстанавливает предыдущей ДКА сканнера. - /// </summary> - protected void Restore() { - if (m_defs.Count == 0) - throw new InvalidOperationException(); - var prev = m_defs.Pop(); - m_states = prev.states; - m_alphabetMap = prev.alphabetMap; - m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; - } - - protected override void Dispose(bool disposing) { - if (disposing) { - if (m_reader != null && m_disposeReader) - m_reader.Dispose(); - m_buffer = null; - m_bufferSize = 0; - m_pointer = 0; - m_tokenLen = 0; - m_tokenOffset = 0; - } - base.Dispose(disposing); - } - } -}
--- a/Implab/Parsing/StarToken.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// <summary> - /// Замыкание выражения с 0 и более повторов. - /// </summary> - public class StarToken: Token { - - Token m_token; - - public Token Token { - get { return m_token; } - } - - public StarToken(Token token) { - Safe.ArgumentNotNull(token, "token"); - m_token = token; - } - - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - - public override string ToString() { - return String.Format("({0})*", Token.ToString()); - } - } -}
--- a/Implab/Parsing/SymbolToken.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// <summary> - /// Выражение, соответсвующее одному символу. - /// </summary> - public class SymbolToken : Token { - int m_value; - - public int Value { - get { return m_value; } - } - - public SymbolToken(int value) { - m_value = value; - } - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - - visitor.Visit(this); - - } - - public override string ToString() { - return Value.ToString(); - } - } -}
--- a/Implab/Parsing/Token.cs Fri Feb 19 18:07:17 2016 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Globalization; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public abstract class Token { - public abstract void Accept(IVisitor visitor); - - public Token Extend() { - return new CatToken(this, new EndToken()); - } - - public Token Tag<T>(T tag) where T : IConvertible { - return new CatToken(this, new EndToken(tag.ToInt32(CultureInfo.InvariantCulture))); - } - - public Token Cat(Token right) { - return new CatToken(this, right); - } - - public Token Or(Token right) { - return new AltToken(this, right); - } - - public Token Optional() { - return Or(new EmptyToken()); - } - - public Token EClosure() { - return new StarToken(this); - } - - public Token Closure() { - return new CatToken(this, new StarToken(this)); - } - - public Token Repeat(int count) { - Token token = null; - - for (int i = 0; i < count; i++) - token = token != null ? token.Cat(this) : this; - return token ?? new EmptyToken(); - } - - public Token Repeat(int min, int max) { - if (min > max || min < 1) - throw new ArgumentOutOfRangeException(); - var token = Repeat(min); - - for (int i = min; i < max; i++) - token = token.Cat( this.Optional() ); - return token; - } - - public static Token New<T>(params T[] set) where T : struct, IConvertible { - Safe.ArgumentNotNull(set, "set"); - Token token = null; - foreach(var c in set.Distinct()) - token = token == null ? new SymbolToken(c.ToInt32(CultureInfo.InvariantCulture)) : token.Or(new SymbolToken(c.ToInt32(CultureInfo.InvariantCulture))); - return token; - } - } -}