# HG changeset patch # User cin # Date 2016-02-24 05:39:53 # Node ID 0526412bbb260d1e70e4413059b1f09813aa369b # Parent 2a8466f0cb8aacae858116af47493d1126bfeecc DFA refactoring diff --git a/Implab/Automaton/AutomatonTransition.cs b/Implab/Automaton/AutomatonTransition.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/AutomatonTransition.cs @@ -0,0 +1,33 @@ +using System; + +namespace Implab.Automaton { + struct AutomatonTransition : IEquatable { + public readonly int s1; + public readonly int s2; + public readonly int edge; + + public AutomatonTransition(int s1, int s2, int edge) { + this.s1 = s1; + this.s2 = s2; + this.edge = edge; + } + + + #region IEquatable implementation + public bool Equals(AutomatonTransition other) { + return other.s1 == s1 && other.s2 == s2 && other.edge == edge ; + } + #endregion + + public override bool Equals(object obj) { + if (obj is AutomatonTransition) + return Equals((AutomatonTransition)obj); + return base.Equals(obj); + } + + public override int GetHashCode() { + return s1 + s2 + edge; + } + } +} + diff --git a/Implab/Automaton/ByteAlphabet.cs b/Implab/Automaton/ByteAlphabet.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/ByteAlphabet.cs @@ -0,0 +1,23 @@ +using System; + +namespace Implab.Automaton { + public class ByteAlphabet : IndexedAlphabetBase { + public ByteAlphabet() : base(byte.MaxValue + 1){ + } + + #region implemented abstract members of IndexedAlphabetBase + + public override int GetSymbolIndex(byte symbol) { + return (int)symbol; + } + + public override System.Collections.Generic.IEnumerable InputSymbols { + get { + throw new NotImplementedException(); + } + } + + #endregion + } +} + diff --git a/Implab/Automaton/CDFADefinition.cs b/Implab/Automaton/CDFADefinition.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/CDFADefinition.cs @@ -0,0 +1,22 @@ +namespace Implab.Automaton { + public class CDFADefinition : DFADefinition { + readonly CharAlphabet m_alphabet; + + public CharAlphabet Alphabet { + get { return m_alphabet; } + } + + public CDFADefinition(CharAlphabet alphabet): base(alphabet.Count) { + m_alphabet = alphabet; + } + + public CDFADefinition Optimize() { + + return (CDFADefinition)Optimize(alphabet => new CDFADefinition((CharAlphabet)alphabet), m_alphabet, new CharAlphabet()); + } + + public void PrintDFA() { + PrintDFA(m_alphabet); + } + } +} diff --git a/Implab/Automaton/CharAlphabet.cs b/Implab/Automaton/CharAlphabet.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/CharAlphabet.cs @@ -0,0 +1,23 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton { + public class CharAlphabet: IndexedAlphabetBase { + + public CharAlphabet() + : base(char.MaxValue + 1) { + } + + public override int GetSymbolIndex(char symbol) { + return symbol; + } + + public override IEnumerable InputSymbols { + get { return Enumerable.Range(char.MinValue, char.MaxValue).Select(x => (char)x); } + } + } +} diff --git a/Implab/Automaton/DFAConst.cs b/Implab/Automaton/DFAConst.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/DFAConst.cs @@ -0,0 +1,10 @@ +using System; + +namespace Implab.Automaton { + public static class DFAConst { + public const int UNREACHABLE_STATE = -1; + + public const int UNCLASSIFIED_INPUT = 0; + } +} + diff --git a/Implab/Automaton/DFADefinition.cs b/Implab/Automaton/DFADefinition.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/DFADefinition.cs @@ -0,0 +1,213 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Implab.Automaton { + public class DFADefinition : IDFADefinitionBuilder, IDFADefinition { + DFAStateDescriptior[] m_dfaTable; + readonly IAlphabet m_inputAlphabet; + readonly IAlphabet m_stateAlphabet; + + readonly Dictionary m_finalStates = new Dictionary(); + readonly HashSet m_transitions = new HashSet(); + + public DFADefinition(IAlphabet inputAlphabet, IAlphabet stateAlphabet) { + Safe.ArgumentNotNull(inputAlphabet, "inputAlphabet"); + Safe.ArgumentNotNull(stateAlphabet, "stateAlphabet"); + + m_inputAlphabet = inputAlphabet; + m_stateAlphabet = stateAlphabet; + } + + public void DefineTransition(int s1, int s2, int symbol) { + Safe.ArgumentInRange(s1, 0, m_stateAlphabet.Count-1, "s1"); + Safe.ArgumentInRange(s2, 0, m_stateAlphabet.Count-1, "s2"); + Safe.ArgumentInRange(symbol, 0, m_inputAlphabet.Count-1, "symbol"); + + m_transitions.Add(new AutomatonTransition(s1, s2, symbol)); + } + + + #region IDFADefinition implementation + + public DFAStateDescriptior[] GetTransitionTable() { + if (m_dfaTable == null) { + m_dfaTable = new DFAStateDescriptior[m_stateAlphabet.Count]; + + foreach (var pair in m_finalStates) { + var idx = pair.Key; + + m_dfaTable[idx].final = true; + m_dfaTable[idx].tag = m_dfaTable[idx].tag != null ? + m_dfaTable[idx].tag.Concat(pair.Value).Distinct().ToArray() : + pair.Value; + } + + foreach (var t in m_transitions) { + if (m_dfaTable[t.s1].transitions == null) { + m_dfaTable[t.s1].transitions = new int[m_inputAlphabet.Count]; + for (int i = 0; i < m_dfaTable[t.s1].transitions.Length; i++) + m_dfaTable[t.s1].transitions[i] = DFAConst.UNREACHABLE_STATE; + } + + m_dfaTable[t.s1].transitions[t.edge] = t.s2; + } + } + return m_dfaTable; + } + + public IAlphabet InputAlphabet { + get { + return m_inputAlphabet; + } + } + + public IAlphabet StateAlphabet { + get { + return m_stateAlphabet; + } + } + + #endregion + + #region IDFADefinitionBuilder + + public void DefineTransition(int s1, int s2, int symbol) { + Safe.ArgumentInRange(s1, 0, m_stateAlphabet.Count - 1, "s1"); + Safe.ArgumentInRange(s2, 0, m_stateAlphabet.Count - 1, "s2"); + Safe.ArgumentInRange(symbol, 0, m_inputAlphabet.Count - 1, "symbol"); + + m_transitions.Add(new AutomatonTransition(s1, s2, symbol)); + } + + public void MarkFinalState(int state, params TTag[] tags) { + m_finalStates[state] = tags; + } + + + #endregion + + protected void Optimize(IDFADefinitionBuilder optimalDFA,IAlphabetBuilder optimalInputAlphabet, IAlphabetBuilder optimalStateAlphabet) { + Safe.ArgumentNotNull(optimalDFA, "dfa"); + Safe.ArgumentNotNull(optimalInputAlphabet, "optimalInputAlphabet"); + Safe.ArgumentNotNull(optimalStateAlphabet, "optimalStateAlphabet"); + + var setComparer = new CustomEqualityComparer>( + (x, y) => x.SetEquals(y), + s => s.Sum(x => x.GetHashCode()) + ); + + var arrayComparer = new CustomEqualityComparer( + (x,y) => (new HashSet(x)).SetEquals(new HashSet(y)), + a => a.Sum(x => x.GetHashCode()) + ); + + var optimalStates = new HashSet>(setComparer); + var queue = new HashSet>(setComparer); + + // получаем конечные состояния, сгруппированные по маркерам + optimalStates.UnionWith( + m_finalStates + .GroupBy(pair => pair.Value, arrayComparer) + .Select( + g => new HashSet( + g.Select( pair => pair.Key) + ) + ) + ); + + var state = new HashSet( + Enumerable + .Range(0, m_stateAlphabet.Count - 1) + .Where(i => !m_finalStates.ContainsKey(i)) + ); + optimalStates.Add(state); + queue.Add(state); + + var rmap = m_transitions + .GroupBy(t => t.s2) + .ToLookup( + g => g.Key, // s2 + g => g.ToLookup(t => t.edge, t => t.s1) + ); + + while (queue.Count > 0) { + var stateA = queue.First(); + queue.Remove(stateA); + + for (int c = 0; c < m_inputAlphabet.Count; c++) { + var stateX = new HashSet(); + foreach(var a in stateA) + stateX.UnionWith(rmap[a][c]); // all states from wich 'c' leads to 'a' + + foreach (var stateY in optimalStates.ToArray()) { + if (stateX.Overlaps(stateY) && !stateY.IsSubsetOf(stateX)) { + var stateR1 = new HashSet(stateY); + var stateR2 = new HashSet(stateY); + + stateR1.IntersectWith(stateX); + stateR2.ExceptWith(stateX); + + optimalStates.Remove(stateY); + optimalStates.Add(stateR1); + optimalStates.Add(stateR2); + + if (queue.Contains(stateY)) { + queue.Remove(stateY); + queue.Add(stateR1); + queue.Add(stateR2); + } else { + queue.Add(stateR1.Count <= stateR2.Count ? stateR1 : stateR2); + } + } + } + } + } + + // карта получения оптимального состояния по соотвествующему ему простому состоянию + var statesMap = m_stateAlphabet.Reclassify(optimalStateAlphabet, optimalStates); + + // получаем минимальный алфавит + // входные символы не различимы, если Move(s,a1) == Move(s,a2) + var optimalAlphabet = m_transitions + .GroupBy(t => Tuple.Create(statesMap[t.s1], statesMap[t.s2]), t => t.edge); + + var alphabetMap = m_inputAlphabet.Reclassify(optimalInputAlphabet, optimalAlphabet); + + var optimalTags = m_finalStates + .GroupBy(pair => statesMap[pair.Key]) + .ToDictionary( + g => g.Key, + g => g.SelectMany(pair => pair.Value).ToArray() + ); + + // построение автомата + foreach (var pair in optimalTags) + optimalDFA.MarkFinalState(pair.Key, pair.Value); + + foreach (var t in m_transitions.Select(t => new AutomatonTransition(statesMap[t.s1],statesMap[t.s2],alphabetMap[t.edge])).Distinct()) + optimalDFA.DefineTransition(t.s1, t.s2, t.edge); + } + + public void PrintDFA() { + + var inputMap = InputAlphabet.CreateReverseMap(); + var stateMap = StateAlphabet.CreateReverseMap(); + + for (int i = 0; i < inputMap.Length; i++) { + Console.WriteLine("C{0}: {1}", i, String.Join(",", inputMap[i])); + } + + foreach(var t in m_transitions) + Console.WriteLine( + "S{0} -{1}-> S{2}{3}", + stateMap[t.s1], + String.Join(",", inputMap[t.edge]), + stateMap[t.s2], + m_finalStates.ContainsKey(t.s2) ? "$" : "" + ); + + } + } +} diff --git a/Implab/Automaton/DFAStateDescriptor.cs b/Implab/Automaton/DFAStateDescriptor.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/DFAStateDescriptor.cs @@ -0,0 +1,13 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton { + public struct DFAStateDescriptior { + public bool final; + public TTag[] tag; + public int[] transitions; + } +} diff --git a/Implab/Automaton/DFAutomaton.cs b/Implab/Automaton/DFAutomaton.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/DFAutomaton.cs @@ -0,0 +1,61 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton { + public abstract class DFAutomaton { + protected struct ContextFrame { + public DFAStateDescriptior[] states; + public int current; + public T info; + } + + public const int INITIAL_STATE = DFADefinition.INITIAL_STATE; + public const int UNREACHEBLE_STATE = DFADefinition.UNREACHEBLE_STATE; + + protected ContextFrame m_context; + Stack m_contextStack = new Stack(); + + protected int Level { + get { return m_contextStack.Count; } + } + + protected DFAutomaton(DFAStateDescriptior[] states, int startState, T info) { + Safe.ArgumentNotNull(states, "states"); + Safe.ArgumentInRange(startState, 0, states.Length - 1, "startState"); + + m_context.states = states; + m_context.current = startState; + m_context.info = info; + } + + protected void Switch(DFAStateDescriptior[] states, int current, T info) { + Debug.Assert(states != null); + Debug.Assert(current >= 0 && current < states.Length); + m_contextStack.Push(m_context); + m_context.states = states; + m_context.current = current; + m_context.info = info; + } + + protected void Restore() { + Debug.Assert(m_contextStack.Count > 0); + + m_context = m_contextStack.Pop(); + } + + protected void Move(int input) { + Debug.Assert(input > 0 && input < m_context.states[m_context.current].transitions.Length); + m_context.current = m_context.states[m_context.current].transitions[input]; + } + + protected bool CanMove(int input) { + Debug.Assert(input > 0 && input < m_context.states[m_context.current].transitions.Length); + return m_context.states[m_context.current].transitions[input] != UNREACHEBLE_STATE; + } + } +} diff --git a/Implab/Automaton/EDFADefinition.cs b/Implab/Automaton/EDFADefinition.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/EDFADefinition.cs @@ -0,0 +1,29 @@ +using Implab; +using System; + +namespace Implab.Parsing { + public class EDFADefinition : DFADefinition where T : struct, IConvertible { + readonly EnumAlphabet m_alphabet; + + public EnumAlphabet Alphabet { + get { return m_alphabet; } + } + + public EDFADefinition(EnumAlphabet alphabet) : base(alphabet.Count) { + m_alphabet = alphabet; + } + + public void DefineTransition(int s1, int s2, T input) { + DefineTransition(s1, s2, m_alphabet.Translate(input)); + } + + public EDFADefinition Optimize() { + + return (EDFADefinition)Optimize(alphabet => new EDFADefinition((EnumAlphabet)alphabet), m_alphabet, new EnumAlphabet()); + } + + public void PrintDFA() { + PrintDFA(m_alphabet); + } + } +} diff --git a/Implab/Automaton/EnumAlphabet.cs b/Implab/Automaton/EnumAlphabet.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/EnumAlphabet.cs @@ -0,0 +1,67 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.Linq; +using System.Diagnostics.CodeAnalysis; + +namespace Implab.Automaton { + /// + /// Алфавит символами которого являются элементы перечислений. + /// + /// Тип перечислений + public class EnumAlphabet : IndexedAlphabetBase where T : struct, IConvertible { + [SuppressMessage("Microsoft.Design", "CA1000:DoNotDeclareStaticMembersOnGenericTypes")] + static readonly T[] _symbols; + static readonly EnumAlphabet _fullAlphabet; + + [SuppressMessage("Microsoft.Design", "CA1065:DoNotRaiseExceptionsInUnexpectedLocations")] + static EnumAlphabet() { + if (!typeof(T).IsEnum) + throw new InvalidOperationException("Invalid generic parameter, enumeration is required"); + + if (Enum.GetUnderlyingType(typeof(T)) != typeof(Int32)) + throw new InvalidOperationException("Only enums based on Int32 are supported"); + + _symbols = ((T[])Enum.GetValues(typeof(T))) + .OrderBy(x => x.ToInt32(CultureInfo.InvariantCulture)) + .ToArray(); + + if ( + _symbols[_symbols.Length - 1].ToInt32(CultureInfo.InvariantCulture) >= _symbols.Length + || _symbols[0].ToInt32(CultureInfo.InvariantCulture) != 0 + ) + throw new InvalidOperationException("The specified enumeration must be zero-based and continuously numbered"); + + _fullAlphabet = new EnumAlphabet(_symbols.Select(x => x.ToInt32(CultureInfo.InvariantCulture)).ToArray()); + } + + + + public static EnumAlphabet FullAlphabet { + get { + return _fullAlphabet; + } + } + + + public EnumAlphabet() + : base(_symbols.Length) { + } + + public EnumAlphabet(int[] map) + : base(map) { + Debug.Assert(map.Length == _symbols.Length); + } + + + public override int GetSymbolIndex(T symbol) { + return symbol.ToInt32(CultureInfo.InvariantCulture); + } + + public override IEnumerable InputSymbols { + get { return _symbols; } + } + + } +} diff --git a/Implab/Automaton/IAlphabet.cs b/Implab/Automaton/IAlphabet.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/IAlphabet.cs @@ -0,0 +1,48 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton { + /// + /// Алфавит. Множество символов, которые разбиты на классы, при этом классы имеют непрерывную нумерацию, + /// что позволяет использовать их в качестве индексов массивов. + /// + /// + /// Алфавит является сюрьективным отображением множества символов в множество индексов, это позволяет сократить размер таблицы переходов автомата + /// для входных символов, которые для него не различимы. + /// + /// Тип символов. + public interface IAlphabet { + /// + /// Количество классов символов в алфавите. + /// + int Count { get; } + + /// + /// Создает карту обратного сопоставления класса символов алфавита и сопоставленным + /// ему исходным символам. + /// + /// + List[] CreateReverseMap(); + + /// + /// Создает новый алфавит на основе текущего, горппируя его сиволы в более + /// крупные непересекающиеся классы символов. + /// + /// Новый, пустой алфавит, в котором быдут определены классы. + /// Множество классов символов текущего алфавита. + /// Карта для перехода классов текущего + /// алфавита к классам нового. + /// Ползволяет укрупнить алфавит, объединив классы в текущем алфавите. Используется при оптимизации автомата. + int[] Reclassify(IAlphabetBuilder newAlphabet, IEnumerable> classes); + + /// + /// Преобразует входной символ в индекс символа из алфавита. + /// + /// Исходный символ + /// Индекс в алфавите + int Translate(TSymbol symobl); + } +} diff --git a/Implab/Automaton/IAlphabetBuilder.cs b/Implab/Automaton/IAlphabetBuilder.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/IAlphabetBuilder.cs @@ -0,0 +1,25 @@ + +using System.Collections.Generic; + +namespace Implab.Automaton { + public interface IAlphabetBuilder : IAlphabet { + /// + /// Добавляет новый символ в алфавит, если символ уже был добавлен, то + /// возвращается ранее сопоставленный с символом класс. + /// + /// Символ для добавления. + /// Индекс класса, который попоставлен с символом. + int DefineSymbol(TSymbol symbol); + /// + /// Доабвляем класс символов. Множеству указанных исходных символов + /// будет сопоставлен символ в алфавите. + /// + /// Множестов исходных символов + /// Идентификатор символа алфавита. + int DefineClass(IEnumerable symbols); + + + + } +} + diff --git a/Implab/Automaton/IDFADefinition.cs b/Implab/Automaton/IDFADefinition.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/IDFADefinition.cs @@ -0,0 +1,56 @@ + +namespace Implab.Automaton { + /// + /// Полностью описывает DFA автомат, его поведение, состояние и входные символы. + /// + /// + /// class MyAutomaton { + /// int m_current; + /// readonly DFAStateDescriptor[] m_automaton; + /// readonly IAlphabet m_commands; + /// + /// public MyAutomaton(IDFADefinition<MyCommands,MyStates,string> definition) { + /// m_current = definition.StateAlphabet.Translate(MyStates.Initial); + /// m_automaton = definition.GetTransitionTable(); + /// m_commands = definition.InputAlphabet; + /// } + /// + /// // defined a method which will move the automaton to the next state + /// public void Move(MyCommands cmd) { + /// // use transition map to determine the next state + /// var next = m_automaton[m_current].transitions[m_commands.Translate(cmd)]; + /// + /// // validate that we aren't in the unreachable state + /// if (next == DFAConst.UNREACHABLE_STATE) + /// throw new InvalidOperationException("The specified command is invalid"); + /// + /// // if everything is ok + /// m_current = next; + /// } + /// } + /// + public interface IDFADefinition { + /// + /// Алфавит входных символов + /// + /// The input alphabet. + IAlphabet InputAlphabet { + get; + } + + /// + /// Алфавит состояний автомата + /// + /// The state alphabet. + IAlphabet StateAlphabet { + get; + } + + /// + /// Таблица переходов состояний автомата + /// + /// The transition table. + DFAStateDescriptior[] GetTransitionTable(); + + } +} diff --git a/Implab/Automaton/IDFADefinitionBuilder.cs b/Implab/Automaton/IDFADefinitionBuilder.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/IDFADefinitionBuilder.cs @@ -0,0 +1,23 @@ +using System; + +namespace Implab.Automaton { + public interface IDFADefinitionBuilder { + /// + /// Marks the state as final and assings tags. + /// + /// State. + /// Tags. + void MarkFinalState(int state, params TTag[] tags); + + /// + /// Defines the transition from to + /// with input . + /// + /// S1. + /// S2. + /// Symbol. + void DefineTransition(int s1, int s2, int symbol); + + } +} + diff --git a/Implab/Automaton/IndexedAlphabetBase.cs b/Implab/Automaton/IndexedAlphabetBase.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/IndexedAlphabetBase.cs @@ -0,0 +1,105 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; + +namespace Implab.Automaton { + /// + /// Indexed alphabet is the finite set of symbols where each symbol has a zero-based unique index. + /// + public abstract class IndexedAlphabetBase : IAlphabetBuilder { + public const int UNCLASSIFIED = 0; + + int m_nextId = 1; + readonly int[] m_map; + + public int Count { + get { return m_nextId; } + } + + protected IndexedAlphabetBase(int mapSize) { + m_map = new int[mapSize]; + } + + protected IndexedAlphabetBase(int[] map) { + Debug.Assert(map != null); + + m_map = map; + m_nextId = map.Max() + 1; + } + + public int DefineSymbol(T symbol) { + var index = GetSymbolIndex(symbol); + if (m_map[index] == UNCLASSIFIED) + m_map[index] = m_nextId++; + return m_map[index]; + } + + public int DefineClass(IEnumerable symbols) { + Safe.ArgumentNotNull(symbols, "symbols"); + symbols = symbols.Distinct(); + + foreach (var symbol in symbols) { + var index = GetSymbolIndex(symbol); + if (m_map[index] == UNCLASSIFIED) + m_map[GetSymbolIndex(symbol)] = m_nextId; + else + throw new InvalidOperationException(String.Format("Symbol '{0}' already in use", symbol)); + } + return m_nextId++; + } + + public List[] CreateReverseMap() { + return + Enumerable.Range(UNCLASSIFIED, Count) + .Select( + i => InputSymbols + .Where(x => i != UNCLASSIFIED && m_map[GetSymbolIndex(x)] == i) + .ToList() + ) + .ToArray(); + } + + public int[] Reclassify(IAlphabetBuilder newAlphabet, IEnumerable> classes) { + Safe.ArgumentNotNull(newAlphabet, "newAlphabet"); + Safe.ArgumentNotNull(classes, "classes"); + var reverseMap = CreateReverseMap(); + + var translationMap = new int[Count]; + + foreach (var scl in classes) { + // skip if the supper class contains the unclassified element + if (scl.Contains(UNCLASSIFIED)) + continue; + var range = new List(); + foreach (var cl in scl) { + if (cl < 0 || cl >= reverseMap.Length) + throw new ArgumentOutOfRangeException(String.Format("Class {0} is not valid for the current alphabet", cl)); + range.AddRange(reverseMap[cl]); + } + var newClass = newAlphabet.DefineClass(range); + foreach (var cl in scl) + translationMap[cl] = newClass; + } + + return translationMap; + } + + public virtual int Translate(T symbol) { + return m_map[GetSymbolIndex(symbol)]; + } + + public abstract int GetSymbolIndex(T symbol); + + public abstract IEnumerable InputSymbols { get; } + + /// + /// Gets the translation map from the index of the symbol to it's class this is usefull for the optimized input symbols transtaion. + /// + /// The translation map. + public int[] GetTranslationMap() { + return m_map; + } + } +} diff --git a/Implab/Automaton/ParserException.cs b/Implab/Automaton/ParserException.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/ParserException.cs @@ -0,0 +1,17 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Implab.Automaton { + [Serializable] + public class ParserException : Exception { + public ParserException() { } + public ParserException(string message) : base(message) { } + public ParserException(string message, Exception inner) : base(message, inner) { } + protected ParserException( + System.Runtime.Serialization.SerializationInfo info, + System.Runtime.Serialization.StreamingContext context) + : base(info, context) { } + } +} diff --git a/Implab/Automaton/RegularExpressions/AltToken.cs b/Implab/Automaton/RegularExpressions/AltToken.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/AltToken.cs @@ -0,0 +1,17 @@ +using System; + +namespace Implab.Automaton.RegularExpressions { + public class AltToken: BinaryToken { + public AltToken(Token left, Token right) + : base(left, right) { + } + + public override void Accept(IVisitor visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + public override string ToString() { + return String.Format(Right is BinaryToken ? "{0}|({1})" : "{0}|{1}", Left, Right); + } + } +} diff --git a/Implab/Automaton/RegularExpressions/BinaryToken.cs b/Implab/Automaton/RegularExpressions/BinaryToken.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/BinaryToken.cs @@ -0,0 +1,21 @@ +using Implab; + +namespace Implab.Automaton.RegularExpressions { + public abstract class BinaryToken : Token { + readonly Token m_left; + readonly Token m_right; + + public Token Left { + get { return m_left; } + } + + public Token Right { + get { return m_right; } + } + + protected BinaryToken(Token left, Token right) { + Safe.ArgumentNotNull(m_left = left, "left"); + Safe.ArgumentNotNull(m_right = right, "right"); + } + } +} diff --git a/Implab/Automaton/RegularExpressions/CatToken.cs b/Implab/Automaton/RegularExpressions/CatToken.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/CatToken.cs @@ -0,0 +1,22 @@ +using System; + +namespace Implab.Automaton.RegularExpressions { + public class CatToken : BinaryToken { + public CatToken(Token left, Token right) + : base(left, right) { + } + + public override void Accept(IVisitor visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + + public override string ToString() { + return String.Format("{0}{1}", FormatToken(Left), FormatToken(Right)); + } + + static string FormatToken(Token token) { + return String.Format(token is AltToken ? "({0})" : "{0}", token); + } + } +} diff --git a/Implab/Automaton/RegularExpressions/DFABuilder.cs b/Implab/Automaton/RegularExpressions/DFABuilder.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/DFABuilder.cs @@ -0,0 +1,181 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; + +namespace Implab.Automaton.RegularExpressions { + /// + /// Используется для построения ДКА по регулярному выражению, сначала обходит + /// регулярное выражение и вычисляет followpos, затем используется метод + /// для построения автомата. + /// + public class DFABuilder : IVisitor { + int m_idx = 0; + Token m_root; + HashSet m_firstpos; + HashSet m_lastpos; + + readonly Dictionary> m_followpos = new Dictionary>(); + readonly Dictionary m_indexes = new Dictionary(); + readonly Dictionary m_ends = new Dictionary(); + + public Dictionary> FollowposMap { + get { return m_followpos; } + } + + public HashSet Followpos(int pos) { + HashSet set; + if (m_followpos.TryGetValue(pos, out set)) + return set; + return m_followpos[pos] = new HashSet(); + } + + bool Nullable(object n) { + if (n is EmptyToken || n is StarToken) + return true; + if (n is AltToken) + return Nullable(((AltToken)n).Left) || Nullable(((AltToken)n).Right); + if (n is CatToken) + return Nullable(((CatToken)n).Left) && Nullable(((CatToken)n).Right); + return false; + } + + + public void Visit(AltToken token) { + if (m_root == null) + m_root = token; + var firtspos = new HashSet(); + var lastpos = new HashSet(); + + token.Left.Accept(this); + firtspos.UnionWith(m_firstpos); + lastpos.UnionWith(m_lastpos); + + token.Right.Accept(this); + firtspos.UnionWith(m_firstpos); + lastpos.UnionWith(m_lastpos); + + m_firstpos = firtspos; + m_lastpos = lastpos; + } + + public void Visit(StarToken token) { + if (m_root == null) + m_root = token; + token.Token.Accept(this); + + foreach (var i in m_lastpos) + Followpos(i).UnionWith(m_firstpos); + } + + public void Visit(CatToken token) { + if (m_root == null) + m_root = token; + + var firtspos = new HashSet(); + var lastpos = new HashSet(); + token.Left.Accept(this); + firtspos.UnionWith(m_firstpos); + var leftLastpos = m_lastpos; + + token.Right.Accept(this); + lastpos.UnionWith(m_lastpos); + var rightFirstpos = m_firstpos; + + if (Nullable(token.Left)) + firtspos.UnionWith(rightFirstpos); + + if (Nullable(token.Right)) + lastpos.UnionWith(leftLastpos); + + m_firstpos = firtspos; + m_lastpos = lastpos; + + foreach (var i in leftLastpos) + Followpos(i).UnionWith(rightFirstpos); + + } + + public void Visit(EmptyToken token) { + if (m_root == null) + m_root = token; + } + + public void Visit(SymbolToken token) { + if (m_root == null) + m_root = token; + m_idx++; + m_indexes[m_idx] = token.Value; + m_firstpos = new HashSet(new[] { m_idx }); + m_lastpos = new HashSet(new[] { m_idx }); + } + + public void Visit(EndToken token) { + if (m_root == null) + m_root = token; + m_idx++; + m_indexes[m_idx] = DFAConst.UNCLASSIFIED_INPUT; + m_firstpos = new HashSet(new[] { m_idx }); + m_lastpos = new HashSet(new[] { m_idx }); + Followpos(m_idx); + m_ends.Add(m_idx, token.Tag); + } + + public void BuildDFA(IDFADefinitionBuilder dfa, IAlphabetBuilder states) { + Safe.ArgumentNotNull(dfa,"dfa"); + + var stateMap = new Dictionary, int>(new CustomEqualityComparer>( + (x, y) => x.SetEquals(y), + x => x.Sum(n => n.GetHashCode()) + )); + + int nextState = 0; + + int initialState = states.DefineSymbol(nextState++); + stateMap[m_firstpos] = initialState; + + var tags = GetStateTags(m_firstpos); + if (tags != null && tags.Length > 0) + dfa.MarkFinalState(initialState, tags); + + var inputMax = m_indexes.Values.Max(); + var queue = new Queue>(); + + queue.Enqueue(m_firstpos); + + while (queue.Count > 0) { + var state = queue.Dequeue(); + var s1 = stateMap[state]; + + for (int a = 0; a <= inputMax; a++) { + var next = new HashSet(); + foreach (var p in state) { + if (m_indexes[p] == a) { + next.UnionWith(Followpos(p)); + } + } + if (next.Count > 0) { + int s2; + if (!stateMap.TryGetValue(next, out s2)) { + s2 = states.DefineSymbol(nextState++); + stateMap[next] = s2; + tags = GetStateTags(next); + if (tags != null && tags.Length > 0) + dfa.MarkFinalState(s2, tags); + + queue.Enqueue(next); + } + dfa.DefineTransition(s1, s2, a); + } + } + } + } + + TTag[] GetStateTags(IEnumerable state) { + Debug.Assert(state != null); + return state.Where(m_ends.ContainsKey).Select(pos => m_ends[pos]).ToArray(); + } + + } +} diff --git a/Implab/Automaton/RegularExpressions/EmptyToken.cs b/Implab/Automaton/RegularExpressions/EmptyToken.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/EmptyToken.cs @@ -0,0 +1,13 @@ +using Implab; + +namespace Implab.Automaton.RegularExpressions { + public class EmptyToken : Token { + public override void Accept(IVisitor visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + public override string ToString() { + return "$"; + } + } +} diff --git a/Implab/Automaton/RegularExpressions/EndToken.cs b/Implab/Automaton/RegularExpressions/EndToken.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/EndToken.cs @@ -0,0 +1,32 @@ +using Implab; + +namespace Implab.Automaton.RegularExpressions { + /// + /// Конечный символ расширенного регулярного выражения, при построении ДКА + /// используется для определения конечных состояний. + /// + public class EndToken: Token { + + TTag m_tag; + + public EndToken(TTag tag) { + m_tag = tag; + } + + public EndToken() + : this(0) { + } + + public TTag Tag { + get { return m_tag; } + } + + public override void Accept(IVisitor visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + public override string ToString() { + return "#"; + } + } +} diff --git a/Implab/Automaton/RegularExpressions/Grammar.cs b/Implab/Automaton/RegularExpressions/Grammar.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/Grammar.cs @@ -0,0 +1,108 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton.RegularExpressions { + /// + /// Базовый абстрактный класс. Грамматика, позволяет формулировать выражения над алфавитом типа char. + /// + /// + public abstract class Grammar where TGrammar: Grammar, new() { + static TGrammar _instance; + + public static TGrammar Instance{ + get { + if (_instance == null) + _instance = new TGrammar(); + return _instance; + } + } + + readonly CharAlphabet m_alphabet = new CharAlphabet(); + + public CharAlphabet Alphabet { + get { return m_alphabet; } + } + + public SymbolToken UnclassifiedToken() { + return new SymbolToken(CharAlphabet.UNCLASSIFIED); + } + + public void DefineAlphabet(IEnumerable alphabet) { + Safe.ArgumentNotNull(alphabet, "alphabet"); + + foreach (var ch in alphabet) + m_alphabet.DefineSymbol(ch); + } + public Token SymbolRangeToken(char start, char end) { + return SymbolToken(Enumerable.Range(start, end - start + 1).Select(x => (char)x)); + } + + public Token SymbolToken(char symbol) { + return Token.New(TranslateOrAdd(symbol)); + } + + public Token SymbolToken(IEnumerable symbols) { + Safe.ArgumentNotNull(symbols, "symbols"); + + return Token.New(TranslateOrAdd(symbols).ToArray()); + } + + public Token SymbolSetToken(params char[] set) { + return SymbolToken(set); + } + + int TranslateOrAdd(char ch) { + var t = m_alphabet.Translate(ch); + if (t == CharAlphabet.UNCLASSIFIED) + t = m_alphabet.DefineSymbol(ch); + return t; + } + + IEnumerable TranslateOrAdd(IEnumerable symbols) { + return symbols.Distinct().Select(TranslateOrAdd); + } + + int TranslateOrDie(char ch) { + var t = m_alphabet.Translate(ch); + if (t == CharAlphabet.UNCLASSIFIED) + throw new ApplicationException(String.Format("Symbol '{0}' is UNCLASSIFIED", ch)); + return t; + } + + IEnumerable TranslateOrDie(IEnumerable symbols) { + return symbols.Distinct().Select(TranslateOrDie); + } + + public Token SymbolTokenExcept(IEnumerable symbols) { + Safe.ArgumentNotNull(symbols, "symbols"); + + return Token.New( Enumerable.Range(0, m_alphabet.Count).Except(TranslateOrDie(symbols)).ToArray()); + } + + protected CDFADefinition BuildDFA(Token lang) { + Safe.ArgumentNotNull(lang, "lang"); + + var dfa = new CDFADefinition(m_alphabet); + + var builder = new DFABuilder(); + + lang.Accept( builder ); + + builder.BuildDFA(dfa); + if (dfa.InitialStateIsFinal) + throw new ApplicationException("The specified language contains empty token"); + + return dfa.Optimize(); + } + + + + //protected abstract TGrammar CreateInstance(); + } + + +} diff --git a/Implab/Automaton/RegularExpressions/IVisitor.cs b/Implab/Automaton/RegularExpressions/IVisitor.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/IVisitor.cs @@ -0,0 +1,13 @@ +namespace Implab.Automaton.RegularExpressions { + /// + /// Интерфейс обходчика синтаксического дерева регулярного выражения + /// + public interface IVisitor { + void Visit(AltToken token); + void Visit(StarToken token); + void Visit(CatToken token); + void Visit(EmptyToken token); + void Visit(EndToken token); + void Visit(SymbolToken token); + } +} diff --git a/Implab/Automaton/RegularExpressions/StarToken.cs b/Implab/Automaton/RegularExpressions/StarToken.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/StarToken.cs @@ -0,0 +1,34 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Implab.Automaton.RegularExpressions { + /// + /// Замыкание выражения с 0 и более повторов. + /// + public class StarToken: Token { + + Token m_token; + + public Token Token { + get { return m_token; } + } + + public StarToken(Token token) { + Safe.ArgumentNotNull(token, "token"); + m_token = token; + } + + public override void Accept(IVisitor visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + visitor.Visit(this); + } + + public override string ToString() { + return String.Format("({0})*", Token); + } + } +} diff --git a/Implab/Automaton/RegularExpressions/SymbolToken.cs b/Implab/Automaton/RegularExpressions/SymbolToken.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/SymbolToken.cs @@ -0,0 +1,27 @@ +using Implab; + +namespace Implab.Automaton.RegularExpressions { + /// + /// Выражение, соответсвующее одному символу. + /// + public class SymbolToken : Token { + int m_value; + + public int Value { + get { return m_value; } + } + + public SymbolToken(int value) { + m_value = value; + } + public override void Accept(IVisitor visitor) { + Safe.ArgumentNotNull(visitor, "visitor"); + + visitor.Visit(this); + } + + public override string ToString() { + return Value.ToString(); + } + } +} diff --git a/Implab/Automaton/RegularExpressions/Token.cs b/Implab/Automaton/RegularExpressions/Token.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/RegularExpressions/Token.cs @@ -0,0 +1,63 @@ +using Implab; +using System; +using System.Linq; + +namespace Implab.Automaton.RegularExpressions { + public abstract class Token { + public abstract void Accept(IVisitor visitor); + + public Token Extend() { + return Cat(new EndToken()); + } + + public Token Tag(T tag) where T : IConvertible { + return Cat(new EndToken(tag)); + } + + public Token Cat(Token right) { + return new CatToken(this, right); + } + + public Token Or(Token right) { + return new AltToken(this, right); + } + + public Token Optional() { + return Or(new EmptyToken()); + } + + public Token EClosure() { + return new StarToken(this); + } + + public Token Closure() { + return Cat(new StarToken(this)); + } + + public Token Repeat(int count) { + Token token = null; + + for (int i = 0; i < count; i++) + token = token != null ? token.Cat(this) : this; + return token ?? new EmptyToken(); + } + + public Token Repeat(int min, int max) { + if (min > max || min < 1) + throw new ArgumentOutOfRangeException(); + var token = Repeat(min); + + for (int i = min; i < max; i++) + token = token.Cat( this.Optional() ); + return token; + } + + public static Token New(params T[] set) where T : struct, IConvertible { + Safe.ArgumentNotNull(set, "set"); + Token token = null; + foreach(var c in set.Distinct()) + token = token == null ? new SymbolToken(c) : token.Or(new SymbolToken(c)); + return token; + } + } +} diff --git a/Implab/Automaton/Scanner.cs b/Implab/Automaton/Scanner.cs new file mode 100644 --- /dev/null +++ b/Implab/Automaton/Scanner.cs @@ -0,0 +1,259 @@ +using Implab; +using System; +using System.Collections.Generic; +using System.IO; +using Implab.Components; + +namespace Implab.Automaton { + /// + /// Базовый класс для разбора потока входных символов на токены. + /// + /// + /// Сканнер имеет внутри буффер с симолами входного текста, по которому перемещаются два + /// указателя, начала и конца токена, при перемещении искользуется ДКА для определения + /// конца токена и допустимости текущего символа. + /// + public abstract class Scanner : Disposable { + struct ScannerConfig { + public DFAStateDescriptior[] states; + public int[] alphabetMap; + } + + Stack m_defs = new Stack(); + + DFAStateDescriptior[] m_states; + int[] m_alphabetMap; + + protected DFAStateDescriptior m_currentState; + int m_previewCode; + + protected int m_tokenLen = 0; + protected int m_tokenOffset; + + protected char[] m_buffer; + protected int m_bufferSize; + protected int m_pointer; + + TextReader m_reader; + bool m_disposeReader; + int m_chunkSize = 1024; // 1k + int m_limit = 10 * 1024 * 1024; // 10Mb + + protected Scanner(DFAStateDescriptior[] states, int[] alphabet) { + Safe.ArgumentNotEmpty(states, "states"); + Safe.ArgumentNotNull(alphabet, "alphabet"); + + m_states = states; + m_alphabetMap = alphabet; + + Feed(new char[0]); + } + + /// + /// Заполняет входными данными буффер. + /// + /// Данные для обработки. + /// Копирование данных не происходит, переданный массив используется в + /// качестве входного буффера. + public void Feed(char[] data) { + Safe.ArgumentNotNull(data, "data"); + + Feed(data, data.Length); + } + + /// + /// Заполняет буффур чтения входными данными. + /// + /// Данные для обработки. + /// Длина данных для обработки. + /// Копирование данных не происходит, переданный массив используется в + /// качестве входного буффера. + public void Feed(char[] data, int length) { + Safe.ArgumentNotNull(data, "data"); + Safe.ArgumentInRange(length, 0, data.Length, "length"); + AssertNotDisposed(); + + m_pointer = -1; + m_buffer = data; + m_bufferSize = length; + Shift(); + } + + public void Feed(TextReader reader, bool dispose) { + Safe.ArgumentNotNull(reader, "reader"); + AssertNotDisposed(); + + if (m_reader != null && m_disposeReader) + m_reader.Dispose(); + + m_reader = reader; + m_disposeReader = dispose; + m_pointer = -1; + m_buffer = new char[m_chunkSize]; + m_bufferSize = 0; + Shift(); + } + + /// + /// Получает текущий токен в виде строки. + /// + /// + protected string GetTokenValue() { + return new String(m_buffer, m_tokenOffset, m_tokenLen); + } + + /// + /// Метки текущего токена, которые были назначены в регулярном выражении. + /// + protected TTag[] TokenTags { + get { + return m_currentState.tag; + } + } + + /// + /// Признак конца данных + /// + public bool EOF { + get { + return m_pointer >= m_bufferSize; + } + } + + /// + /// Читает следующий токен, при этом указывает на начало токена, + /// на длину токена, - массив символов, в + /// котором находится токен. + /// + /// false - достигнут конец данных, токен не прочитан. + protected bool ReadTokenInternal() { + if (m_pointer >= m_bufferSize) + return false; + + m_currentState = m_states[DFADefinition.INITIAL_STATE]; + m_tokenLen = 0; + m_tokenOffset = m_pointer; + int nextState; + do { + nextState = m_currentState.transitions[m_previewCode]; + if (nextState == DFAConst.UNREACHABLE_STATE) { + if (m_currentState.final) + return true; + else + throw new ParserException( + String.Format( + "Unexpected symbol '{0}', at pos {1}", + m_buffer[m_pointer], + Position + ) + ); + } else { + m_currentState = m_states[nextState]; + m_tokenLen++; + } + + } while (Shift()); + + // END OF DATA + if (!m_currentState.final) + throw new ParserException("Unexpected end of data"); + + return true; + } + + + bool Shift() { + m_pointer++; + + if (m_pointer >= m_bufferSize) { + if (!ReadNextChunk()) + return false; + } + + m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; + + return true; + } + + bool ReadNextChunk() { + if (m_reader == null) + return false; + + // extend buffer if nesessary + if (m_pointer + m_chunkSize > m_buffer.Length) { + // trim unused buffer head + var size = m_tokenLen + m_chunkSize; + if (size >= m_limit) + throw new ParserException(String.Format("Input buffer {0} bytes limit exceeded", m_limit)); + var temp = new char[size]; + Array.Copy(m_buffer, m_tokenOffset, temp, 0, m_tokenLen); + m_pointer -= m_tokenOffset; + m_bufferSize -= m_tokenOffset; + m_tokenOffset = 0; + m_buffer = temp; + } + + var read = m_reader.Read(m_buffer, m_tokenLen, m_chunkSize); + if (read == 0) + return false; + + m_bufferSize += read; + + return true; + } + + /// + /// Позиция сканнера во входном буфере + /// + public int Position { + get { + return m_pointer + 1; + } + } + + /// + /// Преключает внутренний ДКА на указанный, позволяет реализовать подобие захватывающей + /// группировки. + /// + /// Таблица состояний нового ДКА + /// Таблица входных символов для нового ДКА + protected void Switch(DFAStateDescriptior[] states, int[] alphabet) { + Safe.ArgumentNotNull(states, "dfa"); + + m_defs.Push(new ScannerConfig { + states = m_states, + alphabetMap = m_alphabetMap + }); + + m_states = states; + m_alphabetMap = alphabet; + + m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; + } + + /// + /// Восстанавливает предыдущей ДКА сканнера. + /// + protected void Restore() { + if (m_defs.Count == 0) + throw new InvalidOperationException(); + var prev = m_defs.Pop(); + m_states = prev.states; + m_alphabetMap = prev.alphabetMap; + m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; + } + + protected override void Dispose(bool disposing) { + if (disposing) { + if (m_reader != null && m_disposeReader) + m_reader.Dispose(); + m_buffer = null; + m_bufferSize = 0; + m_pointer = 0; + m_tokenLen = 0; + m_tokenOffset = 0; + } + base.Dispose(disposing); + } + } +} diff --git a/Implab/Implab.csproj b/Implab/Implab.csproj --- a/Implab/Implab.csproj +++ b/Implab/Implab.csproj @@ -102,26 +102,6 @@ - - - - - - - - - - - - - - - - - - - - @@ -178,11 +158,34 @@ - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -257,5 +260,6 @@ + \ No newline at end of file diff --git a/Implab/Parsing/AltToken.cs b/Implab/Parsing/AltToken.cs deleted file mode 100644 --- a/Implab/Parsing/AltToken.cs +++ /dev/null @@ -1,17 +0,0 @@ -using System; - -namespace Implab.Parsing { - public class AltToken: BinaryToken { - public AltToken(Token left, Token right) - : base(left, right) { - } - - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - public override string ToString() { - return String.Format(Right is BinaryToken ? "{0}|({1})" : "{0}|{1}", Left, Right); - } - } -} diff --git a/Implab/Parsing/BinaryToken.cs b/Implab/Parsing/BinaryToken.cs deleted file mode 100644 --- a/Implab/Parsing/BinaryToken.cs +++ /dev/null @@ -1,26 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public abstract class BinaryToken : Token { - Token m_left; - Token m_right; - - public Token Left { - get { return m_left; } - } - - public Token Right { - get { return m_right; } - } - - protected BinaryToken(Token left, Token right) { - Safe.ArgumentNotNull(m_left = left, "left"); - Safe.ArgumentNotNull(m_right = right, "right"); - } - } -} diff --git a/Implab/Parsing/CDFADefinition.cs b/Implab/Parsing/CDFADefinition.cs deleted file mode 100644 --- a/Implab/Parsing/CDFADefinition.cs +++ /dev/null @@ -1,22 +0,0 @@ -namespace Implab.Parsing { - public class CDFADefinition : DFADefinition { - readonly CharAlphabet m_alphabet; - - public CharAlphabet Alphabet { - get { return m_alphabet; } - } - - public CDFADefinition(CharAlphabet alphabet): base(alphabet.Count) { - m_alphabet = alphabet; - } - - public CDFADefinition Optimize() { - - return (CDFADefinition)Optimize(alphabet => new CDFADefinition((CharAlphabet)alphabet), m_alphabet, new CharAlphabet()); - } - - public void PrintDFA() { - PrintDFA(m_alphabet); - } - } -} diff --git a/Implab/Parsing/CatToken.cs b/Implab/Parsing/CatToken.cs deleted file mode 100644 --- a/Implab/Parsing/CatToken.cs +++ /dev/null @@ -1,22 +0,0 @@ -using System; - -namespace Implab.Parsing { - public class CatToken : BinaryToken { - public CatToken(Token left, Token right) - : base(left, right) { - } - - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - - public override string ToString() { - return String.Format("{0}{1}", FormatToken(Left), FormatToken(Right)); - } - - string FormatToken(Token token) { - return String.Format(token is AltToken ? "({0})" : "{0}", token); - } - } -} diff --git a/Implab/Parsing/CharAlphabet.cs b/Implab/Parsing/CharAlphabet.cs deleted file mode 100644 --- a/Implab/Parsing/CharAlphabet.cs +++ /dev/null @@ -1,23 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public class CharAlphabet: IndexedAlphabetBase { - - public CharAlphabet() - : base(char.MaxValue + 1) { - } - - public override int GetSymbolIndex(char symbol) { - return symbol; - } - - public override IEnumerable InputSymbols { - get { return Enumerable.Range(char.MinValue, char.MaxValue).Select(x => (char)x); } - } - } -} diff --git a/Implab/Parsing/DFABuilder.cs b/Implab/Parsing/DFABuilder.cs deleted file mode 100644 --- a/Implab/Parsing/DFABuilder.cs +++ /dev/null @@ -1,180 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; - -namespace Implab.Parsing { - /// - /// Используется для построения ДКА по регулярному выражению, сначала обходит - /// регулярное выражение и вычисляет followpos, затем используется метод - /// для построения автомата. - /// - public class DFABuilder : IVisitor { - int m_idx = 0; - Token m_root; - HashSet m_firstpos; - HashSet m_lastpos; - - Dictionary> m_followpos = new Dictionary>(); - Dictionary m_indexes = new Dictionary(); - Dictionary m_ends = new Dictionary(); - - public Dictionary> FollowposMap { - get { return m_followpos; } - } - - public HashSet Followpos(int pos) { - HashSet set; - if (m_followpos.TryGetValue(pos, out set)) - return set; - return m_followpos[pos] = new HashSet(); - } - - bool Nullable(object n) { - if (n is EmptyToken || n is StarToken) - return true; - if (n is AltToken) - return Nullable(((AltToken)n).Left) || Nullable(((AltToken)n).Right); - if (n is CatToken) - return Nullable(((CatToken)n).Left) && Nullable(((CatToken)n).Right); - return false; - } - - - public void Visit(AltToken token) { - if (m_root == null) - m_root = token; - var firtspos = new HashSet(); - var lastpos = new HashSet(); - - token.Left.Accept(this); - firtspos.UnionWith(m_firstpos); - lastpos.UnionWith(m_lastpos); - - token.Right.Accept(this); - firtspos.UnionWith(m_firstpos); - lastpos.UnionWith(m_lastpos); - - m_firstpos = firtspos; - m_lastpos = lastpos; - } - - public void Visit(StarToken token) { - if (m_root == null) - m_root = token; - token.Token.Accept(this); - - foreach (var i in m_lastpos) - Followpos(i).UnionWith(m_firstpos); - } - - public void Visit(CatToken token) { - if (m_root == null) - m_root = token; - - var firtspos = new HashSet(); - var lastpos = new HashSet(); - token.Left.Accept(this); - firtspos.UnionWith(m_firstpos); - var leftLastpos = m_lastpos; - - token.Right.Accept(this); - lastpos.UnionWith(m_lastpos); - var rightFirstpos = m_firstpos; - - if (Nullable(token.Left)) - firtspos.UnionWith(rightFirstpos); - - if (Nullable(token.Right)) - lastpos.UnionWith(leftLastpos); - - m_firstpos = firtspos; - m_lastpos = lastpos; - - foreach (var i in leftLastpos) - Followpos(i).UnionWith(rightFirstpos); - - } - - public void Visit(EmptyToken token) { - if (m_root == null) - m_root = token; - ; - } - - public void Visit(SymbolToken token) { - if (m_root == null) - m_root = token; - m_idx++; - m_indexes[m_idx] = token.Value; - m_firstpos = new HashSet(new[] { m_idx }); - m_lastpos = new HashSet(new[] { m_idx }); - } - - public void Visit(EndToken token) { - if (m_root == null) - m_root = token; - m_idx++; - m_indexes[m_idx] = IndexedAlphabetBase.UNCLASSIFIED; - m_firstpos = new HashSet(new[] { m_idx }); - m_lastpos = new HashSet(new[] { m_idx }); - Followpos(m_idx); - m_ends.Add(m_idx, token.Tag); - } - - public void BuildDFA(IDFADefinition dfa) { - Safe.ArgumentNotNull(dfa,"dfa"); - - var stateMap = new Dictionary, int>(new CustomEqualityComparer>( - (x, y) => x.SetEquals(y), - (x) => x.Sum(n => n.GetHashCode()) - )); - - stateMap[m_firstpos] = DefineState( dfa, m_firstpos); - Debug.Assert(stateMap[m_firstpos] == DFADefinition.INITIAL_STATE); - - var queue = new Queue>(); - - queue.Enqueue(m_firstpos); - - while (queue.Count > 0) { - var state = queue.Dequeue(); - var s1 = stateMap[state]; - - for (int a = 0; a < dfa.AlphabetSize; a++) { - var next = new HashSet(); - foreach (var p in state) { - if (m_indexes[p] == a) { - next.UnionWith(Followpos(p)); - } - } - if (next.Count > 0) { - int s2; - if (!stateMap.TryGetValue(next, out s2)) { - stateMap[next] = s2 = DefineState(dfa, next); - queue.Enqueue(next); - } - dfa.DefineTransition(s1, s2, a); - } - } - - } - } - - int[] GetStateTags(HashSet state) { - Debug.Assert(state != null); - return state.Where(m_ends.ContainsKey).Select(pos => m_ends[pos]).ToArray(); - } - - int DefineState(IDFADefinition automa, HashSet state) { - Debug.Assert(automa != null); - Debug.Assert(state != null); - - var tags = GetStateTags(state); - - return tags.Length > 0 ? automa.AddState(tags) : automa.AddState(); - } - - } -} diff --git a/Implab/Parsing/DFADefinition.cs b/Implab/Parsing/DFADefinition.cs deleted file mode 100644 --- a/Implab/Parsing/DFADefinition.cs +++ /dev/null @@ -1,285 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; - -namespace Implab.Parsing { - public class DFADefinition : IDFADefinition { - readonly List> m_states; - - public const int INITIAL_STATE = 1; - public const int UNREACHEBLE_STATE = 0; - - DFAStateDescriptior[] m_statesArray; - readonly int m_alpabetSize; - - public DFADefinition(int alphabetSize) { - m_states = new List>(); - m_alpabetSize = alphabetSize; - - m_states.Add(new DFAStateDescriptior()); - } - - public bool InitialStateIsFinal { - get { - return m_states[INITIAL_STATE].final; - } - } - - public int AddState() { - var index = m_states.Count; - m_states.Add(new DFAStateDescriptior { - final = false, - transitions = new int[AlphabetSize] - }); - m_statesArray = null; - - return index; - } - - public int AddState(TTag[] tag) { - var index = m_states.Count; - bool final = tag != null && tag.Length != 0; - m_states.Add(new DFAStateDescriptior { - final = final, - transitions = new int[AlphabetSize], - tag = final ? tag : null - }); - m_statesArray = null; - return index; - } - - public void DefineTransition(TState s1, TState s2, TInput symbol) { - int is1 = StateAlphabet.Translate(s1); - int is2 = StateAlphabet.Translate(s2); - int isym = InputAlphabet.Translate(symbol); - - Safe.ArgumentAssert(is1 != 0, "s1"); - Safe.ArgumentAssert(is2 != 0, "s2"); - Safe.ArgumentAssert(isym != 0, "symbol"); - - m_states[is1].transitions[isym] = is2; - } - - #region IDFADefinition implementation - - public DFAStateDescriptior[] GetTransitionTable() { - if (m_statesArray == null) - m_statesArray = m_states.ToArray(); - return m_statesArray; - } - - public IAlphabet InputAlphabet { - get { - throw new NotImplementedException(); - } - } - - public IAlphabet StateAlphabet { - get { - throw new NotImplementedException(); - } - } - - #endregion - - protected IDFADefinition<> Optimize(Func, IDFADefinition> dfaFactory,IAlphabet sourceAlphabet, IAlphabet minimalAlphabet) { - Safe.ArgumentNotNull(dfaFactory, "dfaFactory"); - Safe.ArgumentNotNull(minimalAlphabet, "minimalAlphabet"); - - var setComparer = new CustomEqualityComparer>( - (x, y) => x.SetEquals(y), - (s) => s.Sum(x => x.GetHashCode()) - ); - - var arrayComparer = new CustomEqualityComparer( - (x,y) => (new HashSet(x)).SetEquals(new HashSet(y)), - (a) => a.Sum(x => x.GetHashCode()) - ); - - var optimalStates = new HashSet>(setComparer); - var queue = new HashSet>(setComparer); - - foreach (var g in Enumerable - .Range(INITIAL_STATE, m_states.Count-1) - .Select(i => new { - index = i, - descriptor = m_states[i] - }) - .Where(x => x.descriptor.final) - .GroupBy(x => x.descriptor.tag, arrayComparer) - ) { - optimalStates.Add(new HashSet(g.Select(x => x.index))); - } - - var state = new HashSet( - Enumerable - .Range(INITIAL_STATE, m_states.Count - 1) - .Where(i => !m_states[i].final) - ); - optimalStates.Add(state); - queue.Add(state); - - while (queue.Count > 0) { - var stateA = queue.First(); - queue.Remove(stateA); - - for (int c = 0; c < AlphabetSize; c++) { - var stateX = new HashSet(); - - for(int s = 1; s < m_states.Count; s++) { - if (stateA.Contains(m_states[s].transitions[c])) - stateX.Add(s); - } - - foreach (var stateY in optimalStates.ToArray()) { - if (stateX.Overlaps(stateY) && !stateY.IsSubsetOf(stateX)) { - var stateR1 = new HashSet(stateY); - var stateR2 = new HashSet(stateY); - - stateR1.IntersectWith(stateX); - stateR2.ExceptWith(stateX); - - optimalStates.Remove(stateY); - optimalStates.Add(stateR1); - optimalStates.Add(stateR2); - - if (queue.Contains(stateY)) { - queue.Remove(stateY); - queue.Add(stateR1); - queue.Add(stateR2); - } else { - queue.Add(stateR1.Count <= stateR2.Count ? stateR1 : stateR2); - } - } - } - } - } - - // строим карты соотвествия оптимальных состояний с оригинальными - - var initialState = optimalStates.Single(x => x.Contains(INITIAL_STATE)); - - // карта получения оптимального состояния по соотвествующему ему простому состоянию - int[] reveseOptimalMap = new int[m_states.Count]; - // карта с индексами оптимальных состояний - HashSet[] optimalMap = new HashSet[optimalStates.Count + 1]; - { - optimalMap[0] = new HashSet(); // unreachable state - optimalMap[1] = initialState; // initial state - foreach (var ss in initialState) - reveseOptimalMap[ss] = 1; - - int i = 2; - foreach (var s in optimalStates) { - if (s.SetEquals(initialState)) - continue; - optimalMap[i] = s; - foreach (var ss in s) - reveseOptimalMap[ss] = i; - i++; - } - } - - // получаем минимальный алфавит - - var minClasses = new HashSet>(setComparer); - var alphaQueue = new Queue>(); - alphaQueue.Enqueue(new HashSet(Enumerable.Range(0,AlphabetSize))); - - for (int s = 1 ; s < optimalMap.Length; s++) { - var newQueue = new Queue>(); - - foreach (var A in alphaQueue) { - if (A.Count == 1) { - minClasses.Add(A); - continue; - } - - // различаем классы символов, которые переводят в различные оптимальные состояния - // optimalState -> alphaClass - var classes = new Dictionary>(); - - foreach (var term in A) { - // ищем все переходы класса по символу term - var s2 = reveseOptimalMap[ - optimalMap[s].Select(x => m_states[x].transitions[term]).FirstOrDefault(x => x != 0) // первое допустимое элементарное состояние, если есть - ]; - - HashSet A2; - if (!classes.TryGetValue(s2, out A2)) { - A2 = new HashSet(); - newQueue.Enqueue(A2); - classes[s2] = A2; - } - A2.Add(term); - } - } - - if (newQueue.Count == 0) - break; - alphaQueue = newQueue; - } - - foreach (var A in alphaQueue) - minClasses.Add(A); - - var alphabetMap = sourceAlphabet.Reclassify(minimalAlphabet, minClasses); - - // построение автомата - - var minimalDFA = dfaFactory(minimalAlphabet); - - var states = new int[ optimalMap.Length ]; - states[0] = UNREACHEBLE_STATE; - - for(var s = INITIAL_STATE; s < states.Length; s++) { - var tags = optimalMap[s].SelectMany(x => m_states[x].tag ?? Enumerable.Empty()).Distinct().ToArray(); - if (tags.Length > 0) - states[s] = minimalDFA.AddState(tags); - else - states[s] = minimalDFA.AddState(); - } - - Debug.Assert(states[INITIAL_STATE] == INITIAL_STATE); - - for (int s1 = 1; s1 < m_states.Count; s1++) { - for (int c = 0; c < AlphabetSize; c++) { - var s2 = m_states[s1].transitions[c]; - if (s2 != UNREACHEBLE_STATE) { - minimalDFA.DefineTransition( - reveseOptimalMap[s1], - reveseOptimalMap[s2], - alphabetMap[c] - ); - } - } - } - - return minimalDFA; - } - - public void PrintDFA(IAlphabet alphabet) { - - var reverseMap = alphabet.CreateReverseMap(); - - for (int i = 1; i < reverseMap.Length; i++) { - Console.WriteLine("C{0}: {1}", i, String.Join(",", reverseMap[i])); - } - - for (int i = 1; i < m_states.Count; i++) { - var s = m_states[i]; - for (int c = 0; c < AlphabetSize; c++) - if (s.transitions[c] != UNREACHEBLE_STATE) - Console.WriteLine("S{0} -{1}-> S{2}{3}", i, String.Join(",", reverseMap[c]), s.transitions[c], m_states[s.transitions[c]].final ? "$" : ""); - } - } - - public int AlphabetSize { - get { - return m_alpabetSize; - } - } - } -} diff --git a/Implab/Parsing/DFAStateDescriptor.cs b/Implab/Parsing/DFAStateDescriptor.cs deleted file mode 100644 --- a/Implab/Parsing/DFAStateDescriptor.cs +++ /dev/null @@ -1,13 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public struct DFAStateDescriptior { - public bool final; - public TTag[] tag; - public int[] transitions; - } -} diff --git a/Implab/Parsing/DFAutomaton.cs b/Implab/Parsing/DFAutomaton.cs deleted file mode 100644 --- a/Implab/Parsing/DFAutomaton.cs +++ /dev/null @@ -1,61 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public abstract class DFAutomaton { - protected struct ContextFrame { - public DFAStateDescriptior[] states; - public int current; - public T info; - } - - public const int INITIAL_STATE = DFADefinition.INITIAL_STATE; - public const int UNREACHEBLE_STATE = DFADefinition.UNREACHEBLE_STATE; - - protected ContextFrame m_context; - Stack m_contextStack = new Stack(); - - protected int Level { - get { return m_contextStack.Count; } - } - - protected DFAutomaton(DFAStateDescriptior[] states, int startState, T info) { - Safe.ArgumentNotNull(states, "states"); - Safe.ArgumentInRange(startState, 0, states.Length - 1, "startState"); - - m_context.states = states; - m_context.current = startState; - m_context.info = info; - } - - protected void Switch(DFAStateDescriptior[] states, int current, T info) { - Debug.Assert(states != null); - Debug.Assert(current >= 0 && current < states.Length); - m_contextStack.Push(m_context); - m_context.states = states; - m_context.current = current; - m_context.info = info; - } - - protected void Restore() { - Debug.Assert(m_contextStack.Count > 0); - - m_context = m_contextStack.Pop(); - } - - protected void Move(int input) { - Debug.Assert(input > 0 && input < m_context.states[m_context.current].transitions.Length); - m_context.current = m_context.states[m_context.current].transitions[input]; - } - - protected bool CanMove(int input) { - Debug.Assert(input > 0 && input < m_context.states[m_context.current].transitions.Length); - return m_context.states[m_context.current].transitions[input] != UNREACHEBLE_STATE; - } - } -} diff --git a/Implab/Parsing/EDFADefinition.cs b/Implab/Parsing/EDFADefinition.cs deleted file mode 100644 --- a/Implab/Parsing/EDFADefinition.cs +++ /dev/null @@ -1,29 +0,0 @@ -using Implab; -using System; - -namespace Implab.Parsing { - public class EDFADefinition : DFADefinition where T : struct, IConvertible { - readonly EnumAlphabet m_alphabet; - - public EnumAlphabet Alphabet { - get { return m_alphabet; } - } - - public EDFADefinition(EnumAlphabet alphabet) : base(alphabet.Count) { - m_alphabet = alphabet; - } - - public void DefineTransition(int s1, int s2, T input) { - DefineTransition(s1, s2, m_alphabet.Translate(input)); - } - - public EDFADefinition Optimize() { - - return (EDFADefinition)Optimize(alphabet => new EDFADefinition((EnumAlphabet)alphabet), m_alphabet, new EnumAlphabet()); - } - - public void PrintDFA() { - PrintDFA(m_alphabet); - } - } -} diff --git a/Implab/Parsing/EmptyToken.cs b/Implab/Parsing/EmptyToken.cs deleted file mode 100644 --- a/Implab/Parsing/EmptyToken.cs +++ /dev/null @@ -1,18 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public class EmptyToken : Token { - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - public override string ToString() { - return "$"; - } - } -} diff --git a/Implab/Parsing/EndToken.cs b/Implab/Parsing/EndToken.cs deleted file mode 100644 --- a/Implab/Parsing/EndToken.cs +++ /dev/null @@ -1,37 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// - /// Конечный символ расширенного регулярного выражения, при построении ДКА - /// используется для определения конечных состояний. - /// - public class EndToken: Token { - - int m_tag; - - public EndToken(int tag) { - m_tag = tag; - } - - public EndToken() - : this(0) { - } - - public int Tag { - get { return m_tag; } - } - - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - public override string ToString() { - return "#"; - } - } -} diff --git a/Implab/Parsing/EnumAlphabet.cs b/Implab/Parsing/EnumAlphabet.cs deleted file mode 100644 --- a/Implab/Parsing/EnumAlphabet.cs +++ /dev/null @@ -1,67 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Globalization; -using System.Linq; -using System.Diagnostics.CodeAnalysis; - -namespace Implab.Parsing { - /// - /// Алфавит символами которого являются элементы перечислений. - /// - /// Тип перечислений - public class EnumAlphabet : IndexedAlphabetBase where T : struct, IConvertible { - [SuppressMessage("Microsoft.Design", "CA1000:DoNotDeclareStaticMembersOnGenericTypes")] - static readonly T[] _symbols; - static readonly EnumAlphabet _fullAlphabet; - - [SuppressMessage("Microsoft.Design", "CA1065:DoNotRaiseExceptionsInUnexpectedLocations")] - static EnumAlphabet() { - if (!typeof(T).IsEnum) - throw new InvalidOperationException("Invalid generic parameter, enumeration is required"); - - if (Enum.GetUnderlyingType(typeof(T)) != typeof(Int32)) - throw new InvalidOperationException("Only enums based on Int32 are supported"); - - _symbols = ((T[])Enum.GetValues(typeof(T))) - .OrderBy(x => x.ToInt32(CultureInfo.InvariantCulture)) - .ToArray(); - - if ( - _symbols[_symbols.Length - 1].ToInt32(CultureInfo.InvariantCulture) >= _symbols.Length - || _symbols[0].ToInt32(CultureInfo.InvariantCulture) != 0 - ) - throw new InvalidOperationException("The specified enumeration must be zero-based and continuously numbered"); - - _fullAlphabet = new EnumAlphabet(_symbols.Select(x => x.ToInt32(CultureInfo.InvariantCulture)).ToArray()); - } - - - - public static EnumAlphabet FullAlphabet { - get { - return _fullAlphabet; - } - } - - - public EnumAlphabet() - : base(_symbols.Length) { - } - - public EnumAlphabet(int[] map) - : base(map) { - Debug.Assert(map.Length == _symbols.Length); - } - - - public override int GetSymbolIndex(T symbol) { - return symbol.ToInt32(CultureInfo.InvariantCulture); - } - - public override IEnumerable InputSymbols { - get { return _symbols; } - } - - } -} diff --git a/Implab/Parsing/Grammar.cs b/Implab/Parsing/Grammar.cs deleted file mode 100644 --- a/Implab/Parsing/Grammar.cs +++ /dev/null @@ -1,108 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// - /// Базовый абстрактный класс. Грамматика, позволяет формулировать выражения над алфавитом типа char. - /// - /// - public abstract class Grammar where TGrammar: Grammar, new() { - static TGrammar _instance; - - public static TGrammar Instance{ - get { - if (_instance == null) - _instance = new TGrammar(); - return _instance; - } - } - - readonly CharAlphabet m_alphabet = new CharAlphabet(); - - public CharAlphabet Alphabet { - get { return m_alphabet; } - } - - public SymbolToken UnclassifiedToken() { - return new SymbolToken(CharAlphabet.UNCLASSIFIED); - } - - public void DefineAlphabet(IEnumerable alphabet) { - Safe.ArgumentNotNull(alphabet, "alphabet"); - - foreach (var ch in alphabet) - m_alphabet.DefineSymbol(ch); - } - public Token SymbolRangeToken(char start, char end) { - return SymbolToken(Enumerable.Range(start, end - start + 1).Select(x => (char)x)); - } - - public Token SymbolToken(char symbol) { - return Token.New(TranslateOrAdd(symbol)); - } - - public Token SymbolToken(IEnumerable symbols) { - Safe.ArgumentNotNull(symbols, "symbols"); - - return Token.New(TranslateOrAdd(symbols).ToArray()); - } - - public Token SymbolSetToken(params char[] set) { - return SymbolToken(set); - } - - int TranslateOrAdd(char ch) { - var t = m_alphabet.Translate(ch); - if (t == CharAlphabet.UNCLASSIFIED) - t = m_alphabet.DefineSymbol(ch); - return t; - } - - IEnumerable TranslateOrAdd(IEnumerable symbols) { - return symbols.Distinct().Select(TranslateOrAdd); - } - - int TranslateOrDie(char ch) { - var t = m_alphabet.Translate(ch); - if (t == CharAlphabet.UNCLASSIFIED) - throw new ApplicationException(String.Format("Symbol '{0}' is UNCLASSIFIED", ch)); - return t; - } - - IEnumerable TranslateOrDie(IEnumerable symbols) { - return symbols.Distinct().Select(TranslateOrDie); - } - - public Token SymbolTokenExcept(IEnumerable symbols) { - Safe.ArgumentNotNull(symbols, "symbols"); - - return Token.New( Enumerable.Range(0, m_alphabet.Count).Except(TranslateOrDie(symbols)).ToArray()); - } - - protected CDFADefinition BuildDFA(Token lang) { - Safe.ArgumentNotNull(lang, "lang"); - - var dfa = new CDFADefinition(m_alphabet); - - var builder = new DFABuilder(); - - lang.Accept( builder ); - - builder.BuildDFA(dfa); - if (dfa.InitialStateIsFinal) - throw new ApplicationException("The specified language contains empty token"); - - return dfa.Optimize(); - } - - - - //protected abstract TGrammar CreateInstance(); - } - - -} diff --git a/Implab/Parsing/IAlphabet.cs b/Implab/Parsing/IAlphabet.cs deleted file mode 100644 --- a/Implab/Parsing/IAlphabet.cs +++ /dev/null @@ -1,48 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// - /// Алфавит. Множество символов, которые разбиты на классы, при этом классы имеют непрерывную нумерацию, - /// что позволяет использовать их в качестве индексов массивов. - /// - /// - /// Алфавит является сюрьективным отображением множества символов в множество индексов, это позволяет сократить размер таблицы переходов автомата - /// для входных символов, которые для него не различимы. - /// - /// Тип символов. - public interface IAlphabet { - /// - /// Количество классов символов в алфавите. - /// - int Count { get; } - - /// - /// Создает карту обратного сопоставления класса символов алфавита и сопоставленным - /// ему исходным символам. - /// - /// - List[] CreateReverseMap(); - - /// - /// Создает новый алфавит на основе текущего, горппируя его сиволы в более - /// крупные непересекающиеся классы символов. - /// - /// Новый, пустой алфавит, в котором быдут определены классы. - /// Множество классов символов текущего алфавита. - /// Карта для перехода классов текущего - /// алфавита к классам нового. - /// Ползволяет укрупнить алфавит, объединив классы в текущем алфавите. Используется при оптимизации автомата. - int[] Reclassify(IAlphabetBuilder newAlphabet, IEnumerable> classes); - - /// - /// Преобразует входной символ в индекс символа из алфавита. - /// - /// Исходный символ - /// Индекс в алфавите - int Translate(TSymbol symobl); - } -} diff --git a/Implab/Parsing/IAlphabetBuilder.cs b/Implab/Parsing/IAlphabetBuilder.cs deleted file mode 100644 --- a/Implab/Parsing/IAlphabetBuilder.cs +++ /dev/null @@ -1,25 +0,0 @@ -using System; -using System.Collections.Generic; - -namespace Implab.Parsing { - public interface IAlphabetBuilder : IAlphabet { - /// - /// Добавляет новый символ в алфавит, если символ уже был добавлен, то - /// возвращается ранее сопоставленный с символом класс. - /// - /// Символ для добавления. - /// Индекс класса, который попоставлен с символом. - int DefineSymbol(TSymbol symbol); - /// - /// Доабвляем класс символов. Множеству указанных исходных символов - /// будет сопоставлен символ в алфавите. - /// - /// Множестов исходных символов - /// Идентификатор символа алфавита. - int DefineClass(IEnumerable symbols); - - - - } -} - diff --git a/Implab/Parsing/IDFADefinition.cs b/Implab/Parsing/IDFADefinition.cs deleted file mode 100644 --- a/Implab/Parsing/IDFADefinition.cs +++ /dev/null @@ -1,61 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// - /// Полностью описывает DFA автомат, его поведение, состояние и входные символы. - /// - /// - /// class MyAutomaton { - /// int m_current; - /// readonly DFAStateDescriptor[] m_automaton; - /// readonly IAlphabet m_commands; - /// - /// public MyAutomaton(IDFADefinition<MyCommands,MyStates,string> definition) { - /// m_current = definition.StateAlphabet.Translate(MyStates.Initial); - /// m_automaton = definition.GetTransitionTable(); - /// m_commands = definition.InputAlphabet; - /// } - /// - /// // defined a method which will move the automaton to the next state - /// public void Move(MyCommands cmd) { - /// // use transition map to determine the next state - /// var next = m_automaton[m_current].transitions[m_commands.Translate(cmd)]; - /// - /// // validate that we aren't in the unreachable state - /// if (next == DFAConst.UNREACHABLE_STATE) - /// throw new InvalidOperationException("The specified command is invalid"); - /// - /// // if everything is ok - /// m_current = next; - /// } - /// } - /// - public interface IDFADefinition { - /// - /// Алфавит входных символов - /// - /// The input alphabet. - IAlphabet InputAlphabet { - get; - } - - /// - /// Алфавит состояний автомата - /// - /// The state alphabet. - IAlphabet StateAlphabet { - get; - } - - /// - /// Таблица переходов состояний автомата - /// - /// The transition table. - DFAStateDescriptior[] GetTransitionTable(); - - } -} diff --git a/Implab/Parsing/IDFADefinitionBuilder.cs b/Implab/Parsing/IDFADefinitionBuilder.cs deleted file mode 100644 --- a/Implab/Parsing/IDFADefinitionBuilder.cs +++ /dev/null @@ -1,9 +0,0 @@ -using System; - -namespace Implab.Parsing { - public interface IDFADefinitionBuilder : IDFADefinition { - - - } -} - diff --git a/Implab/Parsing/IVisitor.cs b/Implab/Parsing/IVisitor.cs deleted file mode 100644 --- a/Implab/Parsing/IVisitor.cs +++ /dev/null @@ -1,19 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// - /// Интерфейс обходчика синтаксического дерева регулярного выражения - /// - public interface IVisitor { - void Visit(AltToken token); - void Visit(StarToken token); - void Visit(CatToken token); - void Visit(EmptyToken token); - void Visit(EndToken token); - void Visit(SymbolToken token); - } -} diff --git a/Implab/Parsing/IndexedAlphabetBase.cs b/Implab/Parsing/IndexedAlphabetBase.cs deleted file mode 100644 --- a/Implab/Parsing/IndexedAlphabetBase.cs +++ /dev/null @@ -1,107 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// - /// Indexed alphabet is the finite set of symbols where each symbol has a zero-based unique index. - /// - public abstract class IndexedAlphabetBase : IAlphabet { - public const int UNCLASSIFIED = 0; - - int m_nextId = 1; - readonly int[] m_map; - - public int Count { - get { return m_nextId; } - } - - protected IndexedAlphabetBase(int mapSize) { - m_map = new int[mapSize]; - } - - protected IndexedAlphabetBase(int[] map) { - Debug.Assert(map != null); - - m_map = map; - m_nextId = map.Max() + 1; - } - - public int DefineSymbol(T symbol) { - var index = GetSymbolIndex(symbol); - if (m_map[index] == UNCLASSIFIED) - m_map[index] = m_nextId++; - return m_map[index]; - } - - public int DefineClass(IEnumerable symbols) { - Safe.ArgumentNotNull(symbols, "symbols"); - symbols = symbols.Distinct(); - - foreach (var symbol in symbols) { - var index = GetSymbolIndex(symbol); - if (m_map[index] == UNCLASSIFIED) - m_map[GetSymbolIndex(symbol)] = m_nextId; - else - throw new InvalidOperationException(String.Format("Symbol '{0}' already in use", symbol)); - } - return m_nextId++; - } - - public List[] CreateReverseMap() { - return - Enumerable.Range(UNCLASSIFIED, Count) - .Select( - i => InputSymbols - .Where(x => i != UNCLASSIFIED && m_map[GetSymbolIndex(x)] == i) - .ToList() - ) - .ToArray(); - } - - public int[] Reclassify(IAlphabet newAlphabet, IEnumerable> classes) { - Safe.ArgumentNotNull(newAlphabet, "newAlphabet"); - Safe.ArgumentNotNull(classes, "classes"); - var reverseMap = CreateReverseMap(); - - int[] translationMap = new int[Count]; - - foreach (var scl in classes) { - // skip if the supper class contains the unclassified element - if (scl.Contains(UNCLASSIFIED)) - continue; - var range = new List(); - foreach (var cl in scl) { - if (cl < 0 || cl >= reverseMap.Length) - throw new ArgumentOutOfRangeException(String.Format("Class {0} is not valid for the current alphabet", cl)); - range.AddRange(reverseMap[cl]); - } - var newClass = newAlphabet.DefineClass(range); - foreach (var cl in scl) - translationMap[cl] = newClass; - } - - return translationMap; - } - - public virtual int Translate(T symbol) { - return m_map[GetSymbolIndex(symbol)]; - } - - public abstract int GetSymbolIndex(T symbol); - - public abstract IEnumerable InputSymbols { get; } - - /// - /// Gets the translation map from the index of the symbol to it's class this is usefull for the optimized input symbols transtaion. - /// - /// The translation map. - public int[] GetTranslationMap() { - return m_map; - } - } -} diff --git a/Implab/Parsing/ParserException.cs b/Implab/Parsing/ParserException.cs deleted file mode 100644 --- a/Implab/Parsing/ParserException.cs +++ /dev/null @@ -1,17 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace Implab.Parsing { - [Serializable] - public class ParserException : Exception { - public ParserException() { } - public ParserException(string message) : base(message) { } - public ParserException(string message, Exception inner) : base(message, inner) { } - protected ParserException( - System.Runtime.Serialization.SerializationInfo info, - System.Runtime.Serialization.StreamingContext context) - : base(info, context) { } - } -} diff --git a/Implab/Parsing/Scanner.cs b/Implab/Parsing/Scanner.cs deleted file mode 100644 --- a/Implab/Parsing/Scanner.cs +++ /dev/null @@ -1,259 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.IO; -using Implab.Components; - -namespace Implab.Parsing { - /// - /// Базовый класс для разбора потока входных символов на токены. - /// - /// - /// Сканнер имеет внутри буффер с симолами входного текста, по которому перемещаются два - /// указателя, начала и конца токена, при перемещении искользуется ДКА для определения - /// конца токена и допустимости текущего символа. - /// - public abstract class Scanner : Disposable { - struct ScannerConfig { - public DFAStateDescriptior[] states; - public int[] alphabetMap; - } - - Stack m_defs = new Stack(); - - DFAStateDescriptior[] m_states; - int[] m_alphabetMap; - - protected DFAStateDescriptior m_currentState; - int m_previewCode; - - protected int m_tokenLen = 0; - protected int m_tokenOffset; - - protected char[] m_buffer; - protected int m_bufferSize; - protected int m_pointer; - - TextReader m_reader; - bool m_disposeReader; - int m_chunkSize = 1024; // 1k - int m_limit = 10 * 1024 * 1024; // 10Mb - - protected Scanner(DFAStateDescriptior[] states, int[] alphabet) { - Safe.ArgumentNotEmpty(states, "states"); - Safe.ArgumentNotNull(alphabet, "alphabet"); - - m_states = states; - m_alphabetMap = alphabet; - - Feed(new char[0]); - } - - /// - /// Заполняет входными данными буффер. - /// - /// Данные для обработки. - /// Копирование данных не происходит, переданный массив используется в - /// качестве входного буффера. - public void Feed(char[] data) { - Safe.ArgumentNotNull(data, "data"); - - Feed(data, data.Length); - } - - /// - /// Заполняет буффур чтения входными данными. - /// - /// Данные для обработки. - /// Длина данных для обработки. - /// Копирование данных не происходит, переданный массив используется в - /// качестве входного буффера. - public void Feed(char[] data, int length) { - Safe.ArgumentNotNull(data, "data"); - Safe.ArgumentInRange(length, 0, data.Length, "length"); - AssertNotDisposed(); - - m_pointer = -1; - m_buffer = data; - m_bufferSize = length; - Shift(); - } - - public void Feed(TextReader reader, bool dispose) { - Safe.ArgumentNotNull(reader, "reader"); - AssertNotDisposed(); - - if (m_reader != null && m_disposeReader) - m_reader.Dispose(); - - m_reader = reader; - m_disposeReader = dispose; - m_pointer = -1; - m_buffer = new char[m_chunkSize]; - m_bufferSize = 0; - Shift(); - } - - /// - /// Получает текущий токен в виде строки. - /// - /// - protected string GetTokenValue() { - return new String(m_buffer, m_tokenOffset, m_tokenLen); - } - - /// - /// Метки текущего токена, которые были назначены в регулярном выражении. - /// - protected int[] TokenTags { - get { - return m_currentState.tag; - } - } - - /// - /// Признак конца данных - /// - public bool EOF { - get { - return m_pointer >= m_bufferSize; - } - } - - /// - /// Читает следующий токен, при этом указывает на начало токена, - /// на длину токена, - массив символов, в - /// котором находится токен. - /// - /// false - достигнут конец данных, токен не прочитан. - protected bool ReadTokenInternal() { - if (m_pointer >= m_bufferSize) - return false; - - m_currentState = m_states[DFADefinition.INITIAL_STATE]; - m_tokenLen = 0; - m_tokenOffset = m_pointer; - int nextState; - do { - nextState = m_currentState.transitions[m_previewCode]; - if (nextState == DFADefinition.UNREACHEBLE_STATE) { - if (m_currentState.final) - return true; - else - throw new ParserException( - String.Format( - "Unexpected symbol '{0}', at pos {1}", - m_buffer[m_pointer], - Position - ) - ); - } else { - m_currentState = m_states[nextState]; - m_tokenLen++; - } - - } while (Shift()); - - // END OF DATA - if (!m_currentState.final) - throw new ParserException("Unexpected end of data"); - - return true; - } - - - bool Shift() { - m_pointer++; - - if (m_pointer >= m_bufferSize) { - if (!ReadNextChunk()) - return false; - } - - m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; - - return true; - } - - bool ReadNextChunk() { - if (m_reader == null) - return false; - - // extend buffer if nesessary - if (m_pointer + m_chunkSize > m_buffer.Length) { - // trim unused buffer head - var size = m_tokenLen + m_chunkSize; - if (size >= m_limit) - throw new ParserException(String.Format("Input buffer {0} bytes limit exceeded", m_limit)); - var temp = new char[size]; - Array.Copy(m_buffer, m_tokenOffset, temp, 0, m_tokenLen); - m_pointer -= m_tokenOffset; - m_bufferSize -= m_tokenOffset; - m_tokenOffset = 0; - m_buffer = temp; - } - - var read = m_reader.Read(m_buffer, m_tokenLen, m_chunkSize); - if (read == 0) - return false; - - m_bufferSize += read; - - return true; - } - - /// - /// Позиция сканнера во входном буфере - /// - public int Position { - get { - return m_pointer + 1; - } - } - - /// - /// Преключает внутренний ДКА на указанный, позволяет реализовать подобие захватывающей - /// группировки. - /// - /// Таблица состояний нового ДКА - /// Таблица входных символов для нового ДКА - protected void Switch(DFAStateDescriptior[] states, int[] alphabet) { - Safe.ArgumentNotNull(states, "dfa"); - - m_defs.Push(new ScannerConfig { - states = m_states, - alphabetMap = m_alphabetMap - }); - - m_states = states; - m_alphabetMap = alphabet; - - m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; - } - - /// - /// Восстанавливает предыдущей ДКА сканнера. - /// - protected void Restore() { - if (m_defs.Count == 0) - throw new InvalidOperationException(); - var prev = m_defs.Pop(); - m_states = prev.states; - m_alphabetMap = prev.alphabetMap; - m_previewCode = m_alphabetMap[m_buffer[m_pointer]]; - } - - protected override void Dispose(bool disposing) { - if (disposing) { - if (m_reader != null && m_disposeReader) - m_reader.Dispose(); - m_buffer = null; - m_bufferSize = 0; - m_pointer = 0; - m_tokenLen = 0; - m_tokenOffset = 0; - } - base.Dispose(disposing); - } - } -} diff --git a/Implab/Parsing/StarToken.cs b/Implab/Parsing/StarToken.cs deleted file mode 100644 --- a/Implab/Parsing/StarToken.cs +++ /dev/null @@ -1,34 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// - /// Замыкание выражения с 0 и более повторов. - /// - public class StarToken: Token { - - Token m_token; - - public Token Token { - get { return m_token; } - } - - public StarToken(Token token) { - Safe.ArgumentNotNull(token, "token"); - m_token = token; - } - - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - visitor.Visit(this); - } - - public override string ToString() { - return String.Format("({0})*", Token.ToString()); - } - } -} diff --git a/Implab/Parsing/SymbolToken.cs b/Implab/Parsing/SymbolToken.cs deleted file mode 100644 --- a/Implab/Parsing/SymbolToken.cs +++ /dev/null @@ -1,33 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - /// - /// Выражение, соответсвующее одному символу. - /// - public class SymbolToken : Token { - int m_value; - - public int Value { - get { return m_value; } - } - - public SymbolToken(int value) { - m_value = value; - } - public override void Accept(IVisitor visitor) { - Safe.ArgumentNotNull(visitor, "visitor"); - - visitor.Visit(this); - - } - - public override string ToString() { - return Value.ToString(); - } - } -} diff --git a/Implab/Parsing/Token.cs b/Implab/Parsing/Token.cs deleted file mode 100644 --- a/Implab/Parsing/Token.cs +++ /dev/null @@ -1,67 +0,0 @@ -using Implab; -using System; -using System.Collections.Generic; -using System.Globalization; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Implab.Parsing { - public abstract class Token { - public abstract void Accept(IVisitor visitor); - - public Token Extend() { - return new CatToken(this, new EndToken()); - } - - public Token Tag(T tag) where T : IConvertible { - return new CatToken(this, new EndToken(tag.ToInt32(CultureInfo.InvariantCulture))); - } - - public Token Cat(Token right) { - return new CatToken(this, right); - } - - public Token Or(Token right) { - return new AltToken(this, right); - } - - public Token Optional() { - return Or(new EmptyToken()); - } - - public Token EClosure() { - return new StarToken(this); - } - - public Token Closure() { - return new CatToken(this, new StarToken(this)); - } - - public Token Repeat(int count) { - Token token = null; - - for (int i = 0; i < count; i++) - token = token != null ? token.Cat(this) : this; - return token ?? new EmptyToken(); - } - - public Token Repeat(int min, int max) { - if (min > max || min < 1) - throw new ArgumentOutOfRangeException(); - var token = Repeat(min); - - for (int i = min; i < max; i++) - token = token.Cat( this.Optional() ); - return token; - } - - public static Token New(params T[] set) where T : struct, IConvertible { - Safe.ArgumentNotNull(set, "set"); - Token token = null; - foreach(var c in set.Distinct()) - token = token == null ? new SymbolToken(c.ToInt32(CultureInfo.InvariantCulture)) : token.Or(new SymbolToken(c.ToInt32(CultureInfo.InvariantCulture))); - return token; - } - } -}