| @@ -0,0 +1,30 | |||
|  | 1 | using System; | |
|  | 2 | using System.IO; | |
|  | 3 | ||
|  | 4 | namespace Implab.Formats { | |
|  | 5 | public class ReaderScanner: TextScanner { | |
|  | 6 | const int CHUNK_SIZE = 1024; | |
|  | 7 | const int BUFFER_MAX = CHUNK_SIZE*1024; | |
|  | 8 | ||
|  | 9 | readonly TextReader m_reader; | |
|  | 10 | ||
|  | 11 | public ReaderScanner(TextReader reader, int limit, int chunk) : base(limit, chunk) { | |
|  | 12 | Safe.ArgumentNotNull(reader, "reader"); | |
|  | 13 | m_reader = reader; | |
|  | 14 | } | |
|  | 15 | ||
|  | 16 | public ReaderScanner(TextReader reader) : this(reader, BUFFER_MAX, CHUNK_SIZE) { | |
|  | 17 | } | |
|  | 18 | ||
|  | 19 | protected override int Read(char[] buffer, int offset, int size) { | |
|  | 20 | return m_reader.Read(buffer, offset, size); | |
|  | 21 | } | |
|  | 22 | ||
|  | 23 | protected override void Dispose(bool disposing) { | |
|  | 24 | if (disposing) | |
|  | 25 | Safe.Dispose(m_reader); | |
|  | 26 | base.Dispose(disposing); | |
|  | 27 | } | |
|  | 28 | } | |
|  | 29 | } | |
|  | 30 | ||
| @@ -0,0 +1,24 | |||
|  | 1 | using System; | |
|  | 2 | ||
|  | 3 | namespace Implab.Formats { | |
|  | 4 | public class ScannerContext<TTag> { | |
|  | 5 | public int[,] Dfa { get; private set; } | |
|  | 6 | public bool[] Final { get; private set; } | |
|  | 7 | public TTag[][] Tags { get; private set; } | |
|  | 8 | public int State { get; private set; } | |
|  | 9 | public int[] Alphabet { get; private set; } | |
|  | 10 | ||
|  | 11 | public ScannerContext(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet) { | |
|  | 12 | Dfa = dfa; | |
|  | 13 | Final = final; | |
|  | 14 | Tags = tags; | |
|  | 15 | State = state; | |
|  | 16 | Alphabet = alphabet; | |
|  | 17 | } | |
|  | 18 | ||
|  | 19 | public bool Execute(TextScanner scanner, out TTag[] tag) { | |
|  | 20 | return scanner.ReadToken(Dfa, Final, Tags, State, Alphabet, out tag); | |
|  | 21 | } | |
|  | 22 | } | |
|  | 23 | } | |
|  | 24 | ||
| @@ -0,0 +1,26 | |||
|  | 1 | using System; | |
|  | 2 | ||
|  | 3 | namespace Implab.Formats { | |
|  | 4 | public class StringScanner: TextScanner { | |
|  | 5 | const int CHUNK_SIZE = 1024; | |
|  | 6 | ||
|  | 7 | readonly string m_text; | |
|  | 8 | int m_pos; | |
|  | 9 | ||
|  | 10 | public StringScanner(string text) : base(text.Length, text.Length < CHUNK_SIZE ? text.Length : CHUNK_SIZE) { | |
|  | 11 | m_text = text; | |
|  | 12 | Feed(); | |
|  | 13 | } | |
|  | 14 | ||
|  | 15 | protected override int Read(char[] buffer, int offset, int size) { | |
|  | 16 | var actual = size + m_pos > m_text.Length ? m_text.Length - m_pos : size; | |
|  | 17 | ||
|  | 18 | m_text.CopyTo(m_pos,buffer,offset, actual); | |
|  | 19 | ||
|  | 20 | m_pos += actual; | |
|  | 21 | ||
|  | 22 | return actual; | |
|  | 23 | } | |
|  | 24 | } | |
|  | 25 | } | |
|  | 26 | ||
| This diff has been collapsed as it changes many lines, (618 lines changed) Show them Hide them | |||
| @@ -1,305 +1,313 | |||
|  | 1 | using Implab; | |
|  | 2 | using System; | |
|  | 3 | using System.Collections.Generic; | |
|  | 4 | using System.Linq; | |
|  | 5 | ||
|  | 6 | namespace Implab.Automaton { | |
|  | 7 | public class DFATable : IDFATableBuilder { | |
|  | 8 | int m_stateCount; | |
|  | 9 | int m_symbolCount; | |
|  | 10 | int m_initialState; | |
|  | 11 | ||
|  | 12 | readonly HashSet<int> m_finalStates = new HashSet<int>(); | |
|  | 13 | readonly HashSet<AutomatonTransition> m_transitions = new HashSet<AutomatonTransition>(); | |
|  | 14 | ||
|  | 15 | ||
|  | 16 | #region IDFADefinition implementation | |
|  | 17 | ||
|  | 18 | public bool IsFinalState(int s) { | |
|  | 19 | Safe.ArgumentInRange(s, 0, m_stateCount, "s"); | |
|  | 20 | ||
|  | 21 | return m_finalStates.Contains(s); | |
|  | 22 | } | |
|  | 23 | ||
|  | 24 | public IEnumerable<int> FinalStates { | |
|  | 25 | get { | |
|  | 26 | return m_finalStates; | |
|  | 27 | } | |
|  | 28 | } | |
|  | 29 | ||
|  | 30 | public int StateCount { | |
|  | 31 | get { return m_stateCount; } | |
|  | 32 | } | |
|  | 33 | ||
|  | 34 | public int AlphabetSize { | |
|  | 35 | get { return m_symbolCount; } | |
|  | 36 | } | |
|  | 37 | ||
|  | 38 | public int InitialState { | |
|  | 39 | get { return m_initialState; } | |
|  | 40 | } | |
|  | 41 | ||
|  | 42 | #endregion | |
|  | 43 | ||
|  | 44 | public void SetInitialState(int s) { | |
|  | 45 | Safe.ArgumentAssert(s >= 0, "s"); | |
|  | 46 | m_initialState = s; | |
|  | 47 | } | |
|  | 48 | ||
|  | 49 | public void MarkFinalState(int state) { | |
|  | 50 | m_finalStates.Add(state); | |
|  | 51 | } | |
|  | 52 | ||
|  | 53 | public void Add(AutomatonTransition item) { | |
|  | 54 | Safe.ArgumentAssert(item.s1 >= 0, "item"); | |
|  | 55 | Safe.ArgumentAssert(item.s2 >= 0, "item"); | |
|  | 56 | Safe.ArgumentAssert(item.edge >= 0, "item"); | |
|  | 57 | ||
|  | 58 | m_stateCount = Math.Max(m_stateCount, Math.Max(item.s1, item.s2) + 1); | |
|  | 59 | m_symbolCount = Math.Max(m_symbolCount, item.edge); | |
|  | 60 | ||
|  | 61 | m_transitions.Add(item); | |
|  | 62 | } | |
|  | 63 | ||
|  | 64 | public void Clear() { | |
|  | 65 | m_stateCount = 0; | |
|  | 66 | m_symbolCount = 0; | |
|  | 67 | m_finalStates.Clear(); | |
|  | 68 | m_transitions.Clear(); | |
|  | 69 | } | |
|  | 70 | ||
|  | 71 | public bool Contains(AutomatonTransition item) { | |
|  | 72 | return m_transitions.Contains(item); | |
|  | 73 | } | |
|  | 74 | ||
|  | 75 | public void CopyTo(AutomatonTransition[] array, int arrayIndex) { | |
|  | 76 | m_transitions.CopyTo(array, arrayIndex); | |
|  | 77 | } | |
|  | 78 | ||
|  | 79 | public bool Remove(AutomatonTransition item) { | |
|  | 80 | m_transitions.Remove(item); | |
|  | 81 | } | |
|  | 82 | ||
|  | 83 | public int Count { | |
|  | 84 | get { | |
|  | 85 | return m_transitions.Count; | |
|  | 86 | } | |
|  | 87 | } | |
|  | 88 | ||
|  | 89 | public bool IsReadOnly { | |
|  | 90 | get { | |
|  | 91 | return false; | |
|  | 92 | } | |
|  | 93 | } | |
|  | 94 | ||
|  | 95 | public IEnumerator<AutomatonTransition> GetEnumerator() { | |
|  | 96 | return m_transitions.GetEnumerator(); | |
|  | 97 | } | |
|  | 98 | ||
|  | 99 | System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() { | |
|  | 100 | return GetEnumerator(); | |
|  | 101 | } | |
|  | 102 | ||
|  | 103 | public | |
|  | 104 | var table = new | |
|  | 105 | ||
|  | 106 | foreach (var t in this) { | |
|  | 107 | if (table[t.s1].transitions == null) | |
|  | 108 | table[t.s1] = new DFAStateDescriptor(AlphabetSize, IsFinalState(t.s1)); | |
|  | 109 | if (table[t.s2].transitions == null) | |
|  | 110 | table[t.s2] = new DFAStateDescriptor(AlphabetSize, IsFinalState(t.s2)); | |
|  | 111 | table[t.s1 | |
|  | 112 | } | |
|  | 113 | ||
|  | 114 | return table; | |
|  | 115 | } | |
|  | 116 | ||
|  | 117 | /// <summary>Формирует множества конечных состояний перед началом работы алгоритма минимизации.</summary> | |
|  | 118 | /// <remarks> | |
|  | 119 | /// В процессе построения минимального автомата требуется разделить множество состояний, | |
|  | 120 | /// на два подмножества - конечные состояния и все остальные, после чего эти подмножества | |
|  | 121 | /// будут резделены на более мелкие. Иногда требуется гарантировать различия конечных сосотяний, | |
|  | 122 | /// для этого необходимо переопределить даннцю фукнцию, для получения множеств конечных состояний. | |
|  | 123 | /// </remarks> | |
|  | 124 | /// <returns>The final states.</returns> | |
|  | 125 | protected virtual IEnumerable<HashSet<int>> GroupFinalStates() { | |
|  | 126 | return new HashSet<int>[] { m_finalStates }; | |
|  | 127 | } | |
|  | 128 | ||
|  | 129 | protected void Optimize( | |
|  | 130 | IDFATableBuilder optimalDFA, | |
|  | 131 | IDictionary<int,int> alphabetMap, | |
|  | 132 | IDictionary<int,int> stateMap | |
|  | 133 | ) { | |
|  | 134 | Safe.ArgumentNotNull(optimalDFA, "dfa"); | |
|  | 135 | Safe.ArgumentNotNull(alphabetMap, "alphabetMap"); | |
|  | 136 | Safe.ArgumentNotNull(stateMap, "stateMap"); | |
|  | 137 | ||
|  | 138 | ||
|  | 139 | var setComparer = new CustomEqualityComparer<HashSet<int>>( | |
|  | 140 | (x, y) => x.SetEquals(y), | |
|  | 141 | s => s.Sum(x => x.GetHashCode()) | |
|  | 142 | ); | |
|  | 143 | ||
|  | 144 | var optimalStates = new HashSet<HashSet<int>>(setComparer); | |
|  | 145 | var queue = new HashSet<HashSet<int>>(setComparer); | |
|  | 146 | ||
|  | 147 | // получаем конечные состояния, сгруппированные по маркерам | |
|  | 148 | optimalStates.UnionWith( | |
|  | 149 | GroupFinalStates() | |
|  | 150 | ); | |
|  | 151 | ||
|  | 152 | var | |
|  | 153 | Enumerable | |
|  | 154 | .Range(0, m_stateCount - 1) | |
|  | 155 | .Where(i => !m_finalStates.Contains(i)) | |
|  | 156 | ); | |
|  | 157 | ||
|  | 158 | optimalStates.Add(state); | |
|  | 159 | queue.Add(state); | |
|  | 160 | ||
|  | 161 | var rmap = m_transitions | |
|  | 162 | .GroupBy(t => t.s2) | |
|  | 163 | .ToLookup( | |
|  | 164 | g => g.Key, // s2 | |
|  | 165 | g => g.ToLookup(t => t.edge, t => t.s1) | |
|  | 166 | ); | |
|  | 167 | ||
|  | 168 | while (queue.Count > 0) { | |
|  | 169 | var stateA = queue.First(); | |
|  | 170 | queue.Remove(stateA); | |
|  | 171 | ||
|  | 172 | for (int c = 0; c < m_symbolCount; c++) { | |
|  | 173 | var stateX = new HashSet<int>(); | |
|  | 174 | foreach(var a in stateA) | |
|  | 175 | stateX.UnionWith(rmap[a][c]); // all states from wich 'c' leads to 'a' | |
|  | 176 | ||
|  | 177 | foreach (var stateY in optimalStates.ToArray()) { | |
|  | 178 | if (stateX.Overlaps(stateY) && !stateY.IsSubsetOf(stateX)) { | |
|  | 179 | var stateR1 = new HashSet<int>(stateY); | |
|  | 180 | var stateR2 = new HashSet<int>(stateY); | |
|  | 181 | ||
|  | 182 | stateR1.IntersectWith(stateX); | |
|  | 183 | stateR2.ExceptWith(stateX); | |
|  | 184 | ||
|  | 185 |  | |
|  | 186 | optimalStates.Add(stateR1); | |
|  | 187 |  | |
|  | 188 | ||
|  | 189 | if (queue.Contains(stateY)) { | |
|  | 190 |  | |
|  | 191 |  | |
|  | 192 | queue.Add(stateR2); | |
|  | 193 |  | |
|  | 194 | queue.Add(stateR1.Count <= stateR2.Count ? stateR1 : stateR2); | |
|  | 195 |  | |
|  | 196 | } | |
|  | 197 | } | |
|  | 198 | } | |
|  | 199 | } | |
|  | 200 | ||
|  | 201 | // карта получения оптимального состояния по соотвествующему ему простому состоянию | |
|  | 202 | var nextState = 0; | |
|  | 203 | foreach (var item in optimalStates) { | |
|  | 204 |  | |
|  | 205 |  | |
|  | 206 | stateMap[s] = id; | |
|  | 207 | } | |
|  | 208 | ||
|  | 209 | // получаем минимальный алфавит | |
|  | 210 | // входные символы не различимы, если Move(s,a1) == Move(s,a2), для любого s | |
|  | 211 | // для этого используем алгоритм кластеризации, сначала | |
|  | 212 | // считаем, что все символы не различимы | |
|  | 213 | ||
|  | 214 | var minClasses = new HashSet<HashSet<int>>(setComparer); | |
|  | 215 | var alphaQueue = new Queue<HashSet<int>>(); | |
|  | 216 | alphaQueue.Enqueue(new HashSet<int>(Enumerable.Range(0,AlphabetSize))); | |
|  | 217 | ||
|  | 218 | // для всех состояний, будем проверять каждый класс на различимость, | |
|  | 219 | // т.е. символы различимы, если они приводят к разным состояниям | |
|  | 220 | for (int s = 0 ; s < optimalStates.Count; s++) { | |
|  | 221 | var newQueue = new Queue<HashSet<int>>(); | |
|  | 222 | ||
|  | 223 | foreach (var A in alphaQueue) { | |
|  | 224 | // классы из одного символа делить бесполезно, переводим их сразу в | |
|  | 225 | // результирующий алфавит | |
|  | 226 | if (A.Count == 1) { | |
|  | 227 | minClasses.Add(A); | |
|  | 228 | continue; | |
|  | 229 | } | |
|  | 230 | ||
|  | 231 | // различаем классы символов, которые переводят в различные оптимальные состояния | |
|  | 232 | // optimalState -> alphaClass | |
|  | 233 | var classes = new Dictionary<int, HashSet<int>>(); | |
|  | 234 | ||
|  | 235 |  | |
|  | 236 | // ищем все переходы класса по символу term | |
|  | 237 | var res = m_transitions.Where(t => stateMap[t.s1] == s && t.edge == term).Select(t => stateMap[t.s2]).ToArray(); | |
|  | 238 | ||
|  | 239 | var s2 = res.Length > 0 ? res[0] : -1; | |
|  | 240 | ||
|  | 241 | HashSet<int> | |
|  | 242 | if (!classes.TryGetValue(s2, out a2)) { | |
|  | 243 | a2 = new HashSet<int>(); | |
|  | 244 | newQueue.Enqueue(a2); | |
|  | 245 | classes[s2] = a2; | |
|  | 246 | } | |
|  | 247 | a2. | |
|  | 248 |  | |
|  | 249 | } | |
|  | 250 | ||
|  | 251 | if (newQueue.Count == 0) | |
|  | 252 |  | |
|  | 253 | alphaQueue = newQueue; | |
|  | 254 | } | |
|  | 255 | ||
|  | 256 | // после окончания работы алгоритма в очереди останутся минимальные различимые классы | |
|  | 257 | // входных символов | |
|  | 258 | foreach (var A in alphaQueue) | |
|  | 259 | minClasses.Add(A); | |
|  | 260 | ||
|  | 261 | // построение отображения алфавитов входных символов. | |
|  | 262 | // поскольку символ DFAConst.UNCLASSIFIED_INPUT может иметь | |
|  | 263 | // специальное значение, тогда сохраним минимальный класс, | |
|  | 264 | // содержащий этот символ на томже месте. | |
|  | 265 | ||
|  | 266 | var nextCls = 0; | |
|  | 267 |  | |
|  | 268 | if (nextCls == DFAConst.UNCLASSIFIED_INPUT) | |
|  | 269 | nextCls++; | |
|  | 270 | ||
|  | 271 | // сохраняем DFAConst.UNCLASSIFIED_INPUT | |
|  | 272 | var cls = item.Contains(DFAConst.UNCLASSIFIED_INPUT) ? DFAConst.UNCLASSIFIED_INPUT : nextCls; | |
|  | 273 | ||
|  | 274 | foreach (var a in item) | |
|  | 275 | alphabetMap[a] = cls; | |
|  | 276 | ||
|  | 277 | nextCls++; | |
|  | 278 | } | |
|  | 279 | ||
|  | 280 | // построение автомата | |
|  | 281 | optimalDFA.SetInitialState(stateMap[m_initialState]); | |
|  | 282 | ||
|  | 283 | foreach (var sf in m_finalStates.Select(s => stateMap[s]).Distinct()) | |
|  | 284 | optimalDFA.MarkFinalState(sf); | |
|  | 285 | ||
|  | 286 | foreach (var t in m_transitions.Select(t => new AutomatonTransition(stateMap[t.s1],stateMap[t.s2],alphabetMap[t.edge])).Distinct()) | |
|  | 287 | optimalDFA.Add(t); | |
|  | 288 | } | |
|  | 289 | ||
|  | 290 | protected void PrintDFA<TInput, TState>(IAlphabet<TInput> inputAlphabet, IAlphabet<TState> stateAlphabet) { | |
|  | 291 | Safe.ArgumentNotNull(inputAlphabet, "inputAlphabet"); | |
|  | 292 | Safe.ArgumentNotNull(stateAlphabet, "stateAlphabet"); | |
|  | 293 | ||
|  | 294 | foreach(var t in m_transitions) | |
|  | 295 | Console.WriteLine( | |
|  | 296 | "[{0}] -{{{1}}}-> [{2}]{3}", | |
|  | 297 | String.Join(",", stateAlphabet.GetSymbols(t.s1)), | |
|  | 298 | String.Join("", inputAlphabet.GetSymbols(t.edge)), | |
|  | 299 | String.Join(",", stateAlphabet.GetSymbols(t.s2)), | |
|  | 300 | m_finalStates.Contains(t.s2) ? "$" : "" | |
|  | 301 | ); | |
|  | 302 | } | |
|  | 303 | ||
|  | 304 | } | |
|  | 305 | } | |
|  | 1 | using Implab; | |
|  | 2 | using System; | |
|  | 3 | using System.Collections.Generic; | |
|  | 4 | using System.Linq; | |
|  | 5 | ||
|  | 6 | namespace Implab.Automaton { | |
|  | 7 | public class DFATable : IDFATableBuilder { | |
|  | 8 | int m_stateCount; | |
|  | 9 | int m_symbolCount; | |
|  | 10 | int m_initialState; | |
|  | 11 | ||
|  | 12 | readonly HashSet<int> m_finalStates = new HashSet<int>(); | |
|  | 13 | readonly HashSet<AutomatonTransition> m_transitions = new HashSet<AutomatonTransition>(); | |
|  | 14 | ||
|  | 15 | ||
|  | 16 | #region IDFADefinition implementation | |
|  | 17 | ||
|  | 18 | public bool IsFinalState(int s) { | |
|  | 19 | Safe.ArgumentInRange(s, 0, m_stateCount, "s"); | |
|  | 20 | ||
|  | 21 | return m_finalStates.Contains(s); | |
|  | 22 | } | |
|  | 23 | ||
|  | 24 | public IEnumerable<int> FinalStates { | |
|  | 25 | get { | |
|  | 26 | return m_finalStates; | |
|  | 27 | } | |
|  | 28 | } | |
|  | 29 | ||
|  | 30 | public int StateCount { | |
|  | 31 | get { return m_stateCount; } | |
|  | 32 | } | |
|  | 33 | ||
|  | 34 | public int AlphabetSize { | |
|  | 35 | get { return m_symbolCount; } | |
|  | 36 | } | |
|  | 37 | ||
|  | 38 | public int InitialState { | |
|  | 39 | get { return m_initialState; } | |
|  | 40 | } | |
|  | 41 | ||
|  | 42 | #endregion | |
|  | 43 | ||
|  | 44 | public void SetInitialState(int s) { | |
|  | 45 | Safe.ArgumentAssert(s >= 0, "s"); | |
|  | 46 | m_initialState = s; | |
|  | 47 | } | |
|  | 48 | ||
|  | 49 | public void MarkFinalState(int state) { | |
|  | 50 | m_finalStates.Add(state); | |
|  | 51 | } | |
|  | 52 | ||
|  | 53 | public void Add(AutomatonTransition item) { | |
|  | 54 | Safe.ArgumentAssert(item.s1 >= 0, "item"); | |
|  | 55 | Safe.ArgumentAssert(item.s2 >= 0, "item"); | |
|  | 56 | Safe.ArgumentAssert(item.edge >= 0, "item"); | |
|  | 57 | ||
|  | 58 | m_stateCount = Math.Max(m_stateCount, Math.Max(item.s1, item.s2) + 1); | |
|  | 59 | m_symbolCount = Math.Max(m_symbolCount, item.edge); | |
|  | 60 | ||
|  | 61 | m_transitions.Add(item); | |
|  | 62 | } | |
|  | 63 | ||
|  | 64 | public void Clear() { | |
|  | 65 | m_stateCount = 0; | |
|  | 66 | m_symbolCount = 0; | |
|  | 67 | m_finalStates.Clear(); | |
|  | 68 | m_transitions.Clear(); | |
|  | 69 | } | |
|  | 70 | ||
|  | 71 | public bool Contains(AutomatonTransition item) { | |
|  | 72 | return m_transitions.Contains(item); | |
|  | 73 | } | |
|  | 74 | ||
|  | 75 | public void CopyTo(AutomatonTransition[] array, int arrayIndex) { | |
|  | 76 | m_transitions.CopyTo(array, arrayIndex); | |
|  | 77 | } | |
|  | 78 | ||
|  | 79 | public bool Remove(AutomatonTransition item) { | |
|  | 80 | m_transitions.Remove(item); | |
|  | 81 | } | |
|  | 82 | ||
|  | 83 | public int Count { | |
|  | 84 | get { | |
|  | 85 | return m_transitions.Count; | |
|  | 86 | } | |
|  | 87 | } | |
|  | 88 | ||
|  | 89 | public bool IsReadOnly { | |
|  | 90 | get { | |
|  | 91 | return false; | |
|  | 92 | } | |
|  | 93 | } | |
|  | 94 | ||
|  | 95 | public IEnumerator<AutomatonTransition> GetEnumerator() { | |
|  | 96 | return m_transitions.GetEnumerator(); | |
|  | 97 | } | |
|  | 98 | ||
|  | 99 | System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() { | |
|  | 100 | return GetEnumerator(); | |
|  | 101 | } | |
|  | 102 | ||
|  | 103 | public int[,] CreateTransitionTable() { | |
|  | 104 | var table = new int[StateCount,AlphabetSize]; | |
|  | 105 | ||
|  | 106 | for (int i = 0; i < StateCount; i++) | |
|  | 107 | for (int j = 0; i < AlphabetSize; j++) | |
|  | 108 | table[i, j] = DFAConst.UNREACHABLE_STATE; | |
|  | 109 | ||
|  | 110 | foreach (var t in this) | |
|  | 111 | table[t.s1,t.edge] = t.s2; | |
|  | 112 | ||
|  | 113 | return table; | |
|  | 114 | } | |
|  | 115 | ||
|  | 116 | public bool[] CreateFinalStateTable() { | |
|  | 117 | var table = new bool[StateCount]; | |
|  | 118 | ||
|  | 119 | foreach (var s in FinalStates) | |
|  | 120 | table[s] = true; | |
|  | 121 | ||
|  | 122 | return table; | |
|  | 123 | } | |
|  | 124 | ||
|  | 125 | /// <summary>Формирует множества конечных состояний перед началом работы алгоритма минимизации.</summary> | |
|  | 126 | /// <remarks> | |
|  | 127 | /// В процессе построения минимального автомата требуется разделить множество состояний, | |
|  | 128 | /// на два подмножества - конечные состояния и все остальные, после чего эти подмножества | |
|  | 129 | /// будут резделены на более мелкие. Иногда требуется гарантировать различия конечных сосотяний, | |
|  | 130 | /// для этого необходимо переопределить даннцю фукнцию, для получения множеств конечных состояний. | |
|  | 131 | /// </remarks> | |
|  | 132 | /// <returns>The final states.</returns> | |
|  | 133 | protected virtual IEnumerable<HashSet<int>> GroupFinalStates() { | |
|  | 134 | return new HashSet<int>[] { m_finalStates }; | |
|  | 135 | } | |
|  | 136 | ||
|  | 137 | protected void Optimize( | |
|  | 138 | IDFATableBuilder optimalDFA, | |
|  | 139 | IDictionary<int,int> alphabetMap, | |
|  | 140 | IDictionary<int,int> stateMap | |
|  | 141 | ) { | |
|  | 142 | Safe.ArgumentNotNull(optimalDFA, "dfa"); | |
|  | 143 | Safe.ArgumentNotNull(alphabetMap, "alphabetMap"); | |
|  | 144 | Safe.ArgumentNotNull(stateMap, "stateMap"); | |
|  | 145 | ||
|  | 146 | ||
|  | 147 | var setComparer = new CustomEqualityComparer<HashSet<int>>( | |
|  | 148 | (x, y) => x.SetEquals(y), | |
|  | 149 | s => s.Sum(x => x.GetHashCode()) | |
|  | 150 | ); | |
|  | 151 | ||
|  | 152 | var optimalStates = new HashSet<HashSet<int>>(setComparer); | |
|  | 153 | var queue = new HashSet<HashSet<int>>(setComparer); | |
|  | 154 | ||
|  | 155 | // получаем конечные состояния, сгруппированные по маркерам | |
|  | 156 | optimalStates.UnionWith( | |
|  | 157 | GroupFinalStates() | |
|  | 158 | ); | |
|  | 159 | ||
|  | 160 | var state = new HashSet<int>( | |
|  | 161 | Enumerable | |
|  | 162 | .Range(0, m_stateCount - 1) | |
|  | 163 | .Where(i => !m_finalStates.Contains(i)) | |
|  | 164 | ); | |
|  | 165 | ||
|  | 166 | optimalStates.Add(state); | |
|  | 167 | queue.Add(state); | |
|  | 168 | ||
|  | 169 | var rmap = m_transitions | |
|  | 170 | .GroupBy(t => t.s2) | |
|  | 171 | .ToLookup( | |
|  | 172 | g => g.Key, // s2 | |
|  | 173 | g => g.ToLookup(t => t.edge, t => t.s1) | |
|  | 174 | ); | |
|  | 175 | ||
|  | 176 | while (queue.Count > 0) { | |
|  | 177 | var stateA = queue.First(); | |
|  | 178 | queue.Remove(stateA); | |
|  | 179 | ||
|  | 180 | for (int c = 0; c < m_symbolCount; c++) { | |
|  | 181 | var stateX = new HashSet<int>(); | |
|  | 182 | foreach(var a in stateA) | |
|  | 183 | stateX.UnionWith(rmap[a][c]); // all states from wich 'c' leads to 'a' | |
|  | 184 | ||
|  | 185 | foreach (var stateY in optimalStates.ToArray()) { | |
|  | 186 | if (stateX.Overlaps(stateY) && !stateY.IsSubsetOf(stateX)) { | |
|  | 187 | var stateR1 = new HashSet<int>(stateY); | |
|  | 188 | var stateR2 = new HashSet<int>(stateY); | |
|  | 189 | ||
|  | 190 | stateR1.IntersectWith(stateX); | |
|  | 191 | stateR2.ExceptWith(stateX); | |
|  | 192 | ||
|  | 193 | optimalStates.Remove(stateY); | |
|  | 194 | optimalStates.Add(stateR1); | |
|  | 195 | optimalStates.Add(stateR2); | |
|  | 196 | ||
|  | 197 | if (queue.Contains(stateY)) { | |
|  | 198 | queue.Remove(stateY); | |
|  | 199 | queue.Add(stateR1); | |
|  | 200 | queue.Add(stateR2); | |
|  | 201 | } else { | |
|  | 202 | queue.Add(stateR1.Count <= stateR2.Count ? stateR1 : stateR2); | |
|  | 203 | } | |
|  | 204 | } | |
|  | 205 | } | |
|  | 206 | } | |
|  | 207 | } | |
|  | 208 | ||
|  | 209 | // карта получения оптимального состояния по соотвествующему ему простому состоянию | |
|  | 210 | var nextState = 0; | |
|  | 211 | foreach (var item in optimalStates) { | |
|  | 212 | var id = nextState++; | |
|  | 213 | foreach (var s in item) | |
|  | 214 | stateMap[s] = id; | |
|  | 215 | } | |
|  | 216 | ||
|  | 217 | // получаем минимальный алфавит | |
|  | 218 | // входные символы не различимы, если Move(s,a1) == Move(s,a2), для любого s | |
|  | 219 | // для этого используем алгоритм кластеризации, сначала | |
|  | 220 | // считаем, что все символы не различимы | |
|  | 221 | ||
|  | 222 | var minClasses = new HashSet<HashSet<int>>(setComparer); | |
|  | 223 | var alphaQueue = new Queue<HashSet<int>>(); | |
|  | 224 | alphaQueue.Enqueue(new HashSet<int>(Enumerable.Range(0,AlphabetSize))); | |
|  | 225 | ||
|  | 226 | // для всех состояний, будем проверять каждый класс на различимость, | |
|  | 227 | // т.е. символы различимы, если они приводят к разным состояниям | |
|  | 228 | for (int s = 0 ; s < optimalStates.Count; s++) { | |
|  | 229 | var newQueue = new Queue<HashSet<int>>(); | |
|  | 230 | ||
|  | 231 | foreach (var A in alphaQueue) { | |
|  | 232 | // классы из одного символа делить бесполезно, переводим их сразу в | |
|  | 233 | // результирующий алфавит | |
|  | 234 | if (A.Count == 1) { | |
|  | 235 | minClasses.Add(A); | |
|  | 236 | continue; | |
|  | 237 | } | |
|  | 238 | ||
|  | 239 | // различаем классы символов, которые переводят в различные оптимальные состояния | |
|  | 240 | // optimalState -> alphaClass | |
|  | 241 | var classes = new Dictionary<int, HashSet<int>>(); | |
|  | 242 | ||
|  | 243 | foreach (var term in A) { | |
|  | 244 | // ищем все переходы класса по символу term | |
|  | 245 | var res = m_transitions.Where(t => stateMap[t.s1] == s && t.edge == term).Select(t => stateMap[t.s2]).ToArray(); | |
|  | 246 | ||
|  | 247 | var s2 = res.Length > 0 ? res[0] : -1; | |
|  | 248 | ||
|  | 249 | HashSet<int> a2; | |
|  | 250 | if (!classes.TryGetValue(s2, out a2)) { | |
|  | 251 | a2 = new HashSet<int>(); | |
|  | 252 | newQueue.Enqueue(a2); | |
|  | 253 | classes[s2] = a2; | |
|  | 254 | } | |
|  | 255 | a2.Add(term); | |
|  | 256 | } | |
|  | 257 | } | |
|  | 258 | ||
|  | 259 | if (newQueue.Count == 0) | |
|  | 260 | break; | |
|  | 261 | alphaQueue = newQueue; | |
|  | 262 | } | |
|  | 263 | ||
|  | 264 | // после окончания работы алгоритма в очереди останутся минимальные различимые классы | |
|  | 265 | // входных символов | |
|  | 266 | foreach (var A in alphaQueue) | |
|  | 267 | minClasses.Add(A); | |
|  | 268 | ||
|  | 269 | // построение отображения алфавитов входных символов. | |
|  | 270 | // поскольку символ DFAConst.UNCLASSIFIED_INPUT может иметь | |
|  | 271 | // специальное значение, тогда сохраним минимальный класс, | |
|  | 272 | // содержащий этот символ на томже месте. | |
|  | 273 | ||
|  | 274 | var nextCls = 0; | |
|  | 275 | foreach (var item in minClasses) { | |
|  | 276 | if (nextCls == DFAConst.UNCLASSIFIED_INPUT) | |
|  | 277 | nextCls++; | |
|  | 278 | ||
|  | 279 | // сохраняем DFAConst.UNCLASSIFIED_INPUT | |
|  | 280 | var cls = item.Contains(DFAConst.UNCLASSIFIED_INPUT) ? DFAConst.UNCLASSIFIED_INPUT : nextCls; | |
|  | 281 | ||
|  | 282 | foreach (var a in item) | |
|  | 283 | alphabetMap[a] = cls; | |
|  | 284 | ||
|  | 285 | nextCls++; | |
|  | 286 | } | |
|  | 287 | ||
|  | 288 | // построение автомата | |
|  | 289 | optimalDFA.SetInitialState(stateMap[m_initialState]); | |
|  | 290 | ||
|  | 291 | foreach (var sf in m_finalStates.Select(s => stateMap[s]).Distinct()) | |
|  | 292 | optimalDFA.MarkFinalState(sf); | |
|  | 293 | ||
|  | 294 | foreach (var t in m_transitions.Select(t => new AutomatonTransition(stateMap[t.s1],stateMap[t.s2],alphabetMap[t.edge])).Distinct()) | |
|  | 295 | optimalDFA.Add(t); | |
|  | 296 | } | |
|  | 297 | ||
|  | 298 | protected void PrintDFA<TInput, TState>(IAlphabet<TInput> inputAlphabet, IAlphabet<TState> stateAlphabet) { | |
|  | 299 | Safe.ArgumentNotNull(inputAlphabet, "inputAlphabet"); | |
|  | 300 | Safe.ArgumentNotNull(stateAlphabet, "stateAlphabet"); | |
|  | 301 | ||
|  | 302 | foreach(var t in m_transitions) | |
|  | 303 | Console.WriteLine( | |
|  | 304 | "[{0}] -{{{1}}}-> [{2}]{3}", | |
|  | 305 | String.Join(",", stateAlphabet.GetSymbols(t.s1)), | |
|  | 306 | String.Join("", inputAlphabet.GetSymbols(t.edge)), | |
|  | 307 | String.Join(",", stateAlphabet.GetSymbols(t.s2)), | |
|  | 308 | m_finalStates.Contains(t.s2) ? "$" : "" | |
|  | 309 | ); | |
|  | 310 | } | |
|  | 311 | ||
|  | 312 | } | |
|  | 313 | } | |
| @@ -13,82 +13,38 namespace Implab.Automaton { | |||
|  | 13 | 13 | /// to the input alphabet of the automaton. It's assumed that the index to the symbol match | 
|  | 14 | 14 | /// is well known and documented. | 
|  | 15 | 15 | /// </remarks> | 
|  | 16 | public abstract class IndexedAlphabetBase<T> : | |
|  | 17 | int m_nextId = 1; | |
|  | 18 | readonly int[] m_map; | |
|  | 19 | ||
|  | 20 | protected IndexedAlphabetBase(int mapSize) { | |
|  | 21 | m_map = new int[mapSize]; | |
|  | 22 | } | |
|  | 23 | ||
|  | 24 | protected IndexedAlphabetBase(int[] map) { | |
|  | 25 | Debug.Assert(map != null && map.Length > 0); | |
|  | 26 | Debug.Assert(map.All(x => x >= 0)); | |
|  | 27 | ||
|  | 28 | m_map = map; | |
|  | 29 | m_nextId = map.Max() + 1; | |
|  | 30 | } | |
|  | 31 | ||
|  | 32 | public int DefineSymbol(T symbol) { | |
|  | 33 | var index = GetSymbolIndex(symbol); | |
|  | 34 | if (m_map[index] == DFAConst.UNCLASSIFIED_INPUT) | |
|  | 35 | m_map[index] = m_nextId++; | |
|  | 36 | return m_map[index]; | |
|  | 37 | } | |
|  | 38 | ||
|  | 39 | public int DefineSymbol(T symbol, int cls) { | |
|  | 40 | var index = GetSymbolIndex(symbol); | |
|  | 41 | m_map[index] = cls; | |
|  | 42 | m_nextId = Math.Max(cls + 1, m_nextId); | |
|  | 43 | return cls; | |
|  | 44 | } | |
|  | 16 | public abstract class IndexedAlphabetBase<T> : MapAlphabet<T> { | |
|  | 45 | 17 | |
|  | 46 | public int DefineClass(IEnumerable<T> symbols) { | |
|  | 47 | return DefineClass(symbols, m_nextId); | |
|  | 48 | } | |
|  | 49 | ||
|  | 50 | public int DefineClass(IEnumerable<T> symbols, int cls) { | |
|  | 51 | Safe.ArgumentNotNull(symbols, "symbols"); | |
|  | 52 | symbols = symbols.Distinct(); | |
|  | 53 | ||
|  | 54 | foreach (var symbol in symbols) | |
|  | 55 | m_map[GetSymbolIndex(symbol)] = cls; | |
|  | 56 | ||
|  | 57 | m_nextId = Math.Max(cls + 1, m_nextId); | |
|  | 58 | ||
|  | 59 | return cls; | |
|  | 60 | } | |
|  | 61 | ||
|  | 62 | public virtual int Translate(T symbol) { | |
|  | 63 | return m_map[GetSymbolIndex(symbol)]; | |
|  | 64 | } | |
|  | 65 | ||
|  | 66 | public int Count { | |
|  | 67 | get { return m_nextId; } | |
|  | 68 | } | |
|  | 69 | ||
|  | 70 | public bool Contains(T symbol) { | |
|  | 71 | return true; | |
|  | 72 | } | |
|  | 73 | ||
|  | 74 | public IEnumerable<T> GetSymbols(int cls) { | |
|  | 75 | for (var i = 0; i < m_map.Length; i++) | |
|  | 76 | if (m_map[i] == cls) | |
|  | 77 | yield return GetSymbolByIndex(i); | |
|  | 18 | protected IndexedAlphabetBase() :base(true, null) { | |
|  | 78 | 19 | } | 
|  | 79 | 20 | |
|  | 80 | 21 | public abstract int GetSymbolIndex(T symbol); | 
|  | 81 | 22 | |
|  | 82 | public abstract T GetSymbolByIndex(int index); | |
|  | 83 | ||
|  | 84 | public abstract IEnumerable<T> InputSymbols { get; } | |
|  | 85 | ||
|  | 86 | 23 | /// <summary> | 
|  | 87 | 24 | /// Gets the translation map from the index of the symbol to it's class this is usefull for the optimized input symbols transtaion. | 
|  | 88 | 25 | /// </summary> | 
|  | 26 | /// <remarks> | |
|  | 27 | /// The map is continous and start from the symbol with zero code. The last symbol | |
|  | 28 | /// in the map is the last classified symbol in the alphabet, i.e. the map can be | |
|  | 29 | /// shorter then the whole alphabet. | |
|  | 30 | /// </remarks> | |
|  | 89 | 31 | /// <returns>The translation map.</returns> | 
|  | 90 | 32 | public int[] GetTranslationMap() { | 
|  | 91 | return m_map; | |
|  | 33 | Dictionary<int,int> map = new Dictionary<int, int>(); | |
|  | 34 | ||
|  | 35 | int max; | |
|  | 36 | foreach (var p in Mappings) { | |
|  | 37 | var index = GetSymbolIndex(p.Key); | |
|  | 38 | max = Math.Max(max, index); | |
|  | 39 | map[index] = p.Value; | |
|  | 40 | } | |
|  | 41 | ||
|  | 42 | var result = new int[max + 1]; | |
|  | 43 | ||
|  | 44 | for (int i = 0; i < result.Length; i++) | |
|  | 45 | map.TryGetValue(i, out result[i]); | |
|  | 46 | ||
|  | 47 | return result; | |
|  | 92 | 48 | } | 
|  | 93 | 49 | } | 
|  | 94 | 50 | } | 
| @@ -69,9 +69,16 namespace Implab.Automaton { | |||
|  | 69 | 69 | |
|  | 70 | 70 | |
|  | 71 | 71 | public IEnumerable<T> GetSymbols(int cls) { | 
|  | 72 | Safe.ArgumentAssert(cls > 0, "cls"); | |
|  | 72 | 73 | return m_map.Where(p => p.Value == cls).Select(p => p.Key); | 
|  | 73 | 74 | } | 
|  | 74 | 75 | #endregion | 
|  | 76 | ||
|  | 77 | public IEnumerable<KeyValuePair<T,int>> Mappings { | |
|  | 78 | get { | |
|  | 79 | return m_map; | |
|  | 80 | } | |
|  | 81 | } | |
|  | 75 | 82 | } | 
|  | 76 | 83 | } | 
|  | 77 | 84 | |
| @@ -66,9 +66,9 namespace Implab.Automaton.RegularExpres | |||
|  | 66 | 66 | return Token<TTag>.New( Enumerable.Range(0, AlphabetBuilder.Count).Except(TranslateOrDie(symbols)).ToArray() ); | 
|  | 67 | 67 | } | 
|  | 68 | 68 | |
|  | 69 | protected abstract IAlphabetB | |
|  | 69 | protected abstract IndexedAlphabetBase<TSymbol> CreateAlphabet(); | |
|  | 70 | 70 | |
|  | 71 | protected | |
|  | 71 | protected ScannerContext<TTag> BuildScannerContext(Token<TTag> regexp) { | |
|  | 72 | 72 | |
|  | 73 | 73 | var dfa = new RegularDFA<TSymbol, TTag>(AlphabetBuilder); | 
|  | 74 | 74 | |
| @@ -80,7 +80,16 namespace Implab.Automaton.RegularExpres | |||
|  | 80 | 80 | if (dfa.IsFinalState(dfa.InitialState)) | 
|  | 81 | 81 | throw new ApplicationException("The specified language contains empty token"); | 
|  | 82 | 82 | |
|  | 83 | r | |
|  | 83 | var ab = CreateAlphabet(); | |
|  | 84 | var optimal = dfa.Optimize(ab); | |
|  | 85 | ||
|  | 86 | return new ScannerContext<TTag>( | |
|  | 87 | optimal.CreateTransitionTable(), | |
|  | 88 | optimal.CreateFinalStateTable(), | |
|  | 89 | optimal.CreateTagTable(), | |
|  | 90 | optimal.InitialState, | |
|  | 91 | ab.GetTranslationMap() | |
|  | 92 | ); | |
|  | 84 | 93 | } | 
|  | 85 | 94 | |
|  | 86 | 95 | } | 
| @@ -36,16 +36,11 namespace Implab.Automaton.RegularExpres | |||
|  | 36 | 36 | return m_tags.TryGetValue(s, out tags) ? tags : new TTag[0]; | 
|  | 37 | 37 | } | 
|  | 38 | 38 | |
|  | 39 | public | |
|  | 40 | var table = new | |
|  | 39 | public TTag[][] CreateTagTable() { | |
|  | 40 | var table = new TTag[StateCount][]; | |
|  | 41 | 41 | |
|  | 42 | foreach (var | |
|  | 43 | if (table[t.s1].transitions == null) | |
|  | 44 | table[t.s1] = new DFAStateDescriptor<TTag>(AlphabetSize, IsFinalState(t.s1), GetStateTag(t.s1)); | |
|  | 45 | if (table[t.s2].transitions == null) | |
|  | 46 | table[t.s2] = new DFAStateDescriptor<TTag>(AlphabetSize, IsFinalState(t.s2), GetStateTag(t.s2)); | |
|  | 47 | table[t.s1].transitions[t.edge] = t.s2; | |
|  | 48 | } | |
|  | 42 | foreach (var pair in m_tags) | |
|  | 43 | table[pair.Key] = pair.Value; | |
|  | 49 | 44 | |
|  | 50 | 45 | return table; | 
|  | 51 | 46 | } | 
| @@ -4,7 +4,7 using Implab.Automaton; | |||
|  | 4 | 4 | |
|  | 5 | 5 | namespace Implab.Formats { | 
|  | 6 | 6 | public class ByteAlphabet : IndexedAlphabetBase<byte> { | 
|  | 7 | public ByteAlphabet() | |
|  | 7 | public ByteAlphabet() { | |
|  | 8 | 8 | } | 
|  | 9 | 9 | |
|  | 10 | 10 | #region implemented abstract members of IndexedAlphabetBase | 
| @@ -13,10 +13,6 namespace Implab.Formats { | |||
|  | 13 | 13 | return (int)symbol; | 
|  | 14 | 14 | } | 
|  | 15 | 15 | |
|  | 16 | public override byte GetSymbolByIndex(int index) { | |
|  | 17 | return (byte)index; | |
|  | 18 | } | |
|  | 19 | ||
|  | 20 | 16 | public IEnumerable<byte> InputSymbols { | 
|  | 21 | 17 | get { | 
|  | 22 | 18 | return Enumerable.Range(byte.MinValue, byte.MaxValue).Cast<byte>(); | 
| @@ -5,19 +5,14 using Implab.Automaton; | |||
|  | 5 | 5 | namespace Implab.Formats { | 
|  | 6 | 6 | public class CharAlphabet: IndexedAlphabetBase<char> { | 
|  | 7 | 7 | |
|  | 8 | public CharAlphabet() | |
|  | 9 | : base(char.MaxValue + 1) { | |
|  | 8 | public CharAlphabet() { | |
|  | 10 | 9 | } | 
|  | 11 | 10 | |
|  | 12 | 11 | public override int GetSymbolIndex(char symbol) { | 
|  | 13 | 12 | return symbol; | 
|  | 14 | 13 | } | 
|  | 15 | 14 | |
|  | 16 | public | |
|  | 17 | return (char)index; | |
|  | 18 | } | |
|  | 19 | ||
|  | 20 | public override IEnumerable<char> InputSymbols { | |
|  | 15 | public IEnumerable<char> InputSymbols { | |
|  | 21 | 16 | get { return Enumerable.Range(char.MinValue, char.MaxValue).Cast<char>(); } | 
|  | 22 | 17 | } | 
|  | 23 | 18 | } | 
| @@ -20,14 +20,7 namespace Implab.Formats.JSON { | |||
|  | 20 | 20 | StringBound, | 
|  | 21 | 21 | EscapedChar, | 
|  | 22 | 22 | UnescapedChar, | 
|  | 23 | EscapedUnicode | |
|  | 24 | ||
|  | 25 | Minus, | |
|  | 26 | Plus, | |
|  | 27 | Sign, | |
|  | 28 | Integer, | |
|  | 29 | Dot, | |
|  | 30 | Exp | |
|  | 23 | EscapedUnicode | |
|  | 31 | 24 | } | 
|  | 32 | 25 | |
|  | 33 | 26 | static Lazy<JSONGrammar> _instance = new Lazy<JSONGrammar>(); | 
| @@ -36,8 +29,8 namespace Implab.Formats.JSON { | |||
|  | 36 | 29 | get { return _instance.Value; } | 
|  | 37 | 30 | } | 
|  | 38 | 31 | |
|  | 39 | readonly | |
|  | 40 | readonly | |
|  | 32 | readonly ScannerContext<TokenType> m_jsonDFA; | |
|  | 33 | readonly ScannerContext<TokenType> m_stringDFA; | |
|  | 41 | 34 | |
|  | 42 | 35 | public JSONGrammar() { | 
|  | 43 | 36 | DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x)); | 
| @@ -88,17 +81,17 namespace Implab.Formats.JSON { | |||
|  | 88 | 81 | .Or(unescaped.Closure().Tag(TokenType.UnescapedChar)); | 
|  | 89 | 82 | |
|  | 90 | 83 | |
|  | 91 | m_jsonDFA = Build | |
|  | 92 | m_stringDFA = Build | |
|  | 84 | m_jsonDFA = BuildScannerContext(jsonExpression); | |
|  | 85 | m_stringDFA = BuildScannerContext(jsonStringExpression); | |
|  | 93 | 86 | } | 
|  | 94 | 87 | |
|  | 95 | public | |
|  | 88 | public ScannerContext<TokenType> JsonDFA { | |
|  | 96 | 89 | get { | 
|  | 97 | 90 | return m_jsonDFA; | 
|  | 98 | 91 | } | 
|  | 99 | 92 | } | 
|  | 100 | 93 | |
|  | 101 | public | |
|  | 94 | public ScannerContext<TokenType> JsonStringDFA { | |
|  | 102 | 95 | get { | 
|  | 103 | 96 | return m_stringDFA; | 
|  | 104 | 97 | } | 
| @@ -1,25 +1,37 | |||
|  | 1 | 1 | using System; | 
|  | 2 | 2 | using System.Globalization; | 
|  | 3 | 3 | using Implab.Automaton; | 
|  | 4 | using System.Text; | |
|  | 5 | using Implab.Components; | |
|  | 6 | using System.IO; | |
|  | 7 | using Implab.Automaton.RegularExpressions; | |
|  | 4 | 8 | |
|  | 5 | 9 | namespace Implab.Formats.JSON { | 
|  | 6 | 10 | /// <summary> | 
|  | 7 | 11 | /// Сканнер (лексер), разбивающий поток символов на токены JSON. | 
|  | 8 | 12 | /// </summary> | 
|  | 9 | public class JSONScanner : | |
|  | 10 | char[] m_stringBuffer; | |
|  | 11 | DFAStateDescriptior<>[] m_stringDFA; | |
|  | 12 | int[] m_stringAlphabet; | |
|  | 13 | public class JSONScanner : Disposable { | |
|  | 14 | readonly StringBuilder m_builder = new StringBuilder(); | |
|  | 15 | ||
|  | 16 | readonly ScannerContext<JSONGrammar.TokenType> m_jsonScanner = JSONGrammar.Instance.JsonDFA; | |
|  | 17 | readonly ScannerContext<JSONGrammar.TokenType> m_stringScanner = JSONGrammar.Instance.JsonStringDFA; | |
|  | 18 | ||
|  | 19 | ||
|  | 20 | readonly TextScanner m_scanner; | |
|  | 13 | 21 | |
|  | 14 | 22 | /// <summary> | 
|  | 15 | 23 | /// Создает новый экземпляр сканнера | 
|  | 16 | 24 | /// </summary> | 
|  | 17 | public JSONScanner() | |
|  | 18 | : base(JSONGrammar.Instance.JsonDFA.GetTransitionTable(), JSONGrammar.Instance.JsonDFA.Alphabet.GetTranslationMap()) { | |
|  | 19 | m_stringBuffer = new char[1024]; | |
|  | 20 | var dfa = JSONGrammar.Instance.JsonStringDFA; | |
|  | 21 | m_stringAlphabet = dfa.Alphabet.GetTranslationMap(); | |
|  | 22 | m_stringDFA = dfa.States; | |
|  | 25 | public JSONScanner(string text) { | |
|  | 26 | Safe.ArgumentNotEmpty(text, "text"); | |
|  | 27 | ||
|  | 28 | m_scanner = new StringScanner(text); | |
|  | 29 | } | |
|  | 30 | ||
|  | 31 | public JSONScanner(TextReader reader, int bufferMax, int chunkSize) { | |
|  | 32 | Safe.ArgumentNotNull(reader, "reader"); | |
|  | 33 | ||
|  | 34 | m_scanner = new ReaderScanner(reader); | |
|  | 23 | 35 | } | 
|  | 24 | 36 | |
|  | 25 | 37 | /// <summary> | 
| @@ -31,19 +43,20 namespace Implab.Formats.JSON { | |||
|  | 31 | 43 | /// <remarks>В случе если токен не распознается, возникает исключение. Значения токенов обрабатываются, т.е. | 
|  | 32 | 44 | /// в строках обрабатываются экранированные символы, числа становтся типа double.</remarks> | 
|  | 33 | 45 | public bool ReadToken(out object tokenValue, out JsonTokenType tokenType) { | 
|  | 34 | if (ReadTokenInternal()) { | |
|  | 35 | switch ((JSONGrammar.TokenType)m_currentState.tag[0]) { | |
|  | 46 | JSONGrammar.TokenType[] tag; | |
|  | 47 | if (m_jsonScanner.Execute(m_scanner, out tag)) { | |
|  | 48 | switch (tag[0]) { | |
|  | 36 | 49 | case JSONGrammar.TokenType.StringBound: | 
|  | 37 | 50 | tokenValue = ReadString(); | 
|  | 38 | 51 | tokenType = JsonTokenType.String; | 
|  | 39 | 52 | break; | 
|  | 40 | 53 | case JSONGrammar.TokenType.Number: | 
|  | 41 | tokenValue = Double.Parse( | |
|  | 54 | tokenValue = Double.Parse(m_scanner.GetTokenValue(), CultureInfo.InvariantCulture); | |
|  | 42 | 55 | tokenType = JsonTokenType.Number; | 
|  | 43 | 56 | break; | 
|  | 44 | 57 | default: | 
|  | 45 | tokenType = (JsonTokenType) | |
|  | 46 | tokenValue = | |
|  | 58 | tokenType = (JsonTokenType)tag[0]; | |
|  | 59 | tokenValue = m_scanner.GetTokenValue(); | |
|  | 47 | 60 | break; | 
|  | 48 | 61 | } | 
|  | 49 | 62 | return true; | 
| @@ -55,26 +68,26 namespace Implab.Formats.JSON { | |||
|  | 55 | 68 | |
|  | 56 | 69 | string ReadString() { | 
|  | 57 | 70 | int pos = 0; | 
|  | 58 | Switch(m_stringDFA, m_stringAlphabet); | |
|  | 59 | while (ReadTokenInternal()) { | |
|  | 60 |  | |
|  | 71 | char[] buf = new char[6]; // the buffer for unescaping chars | |
|  | 72 | ||
|  | 73 | JSONGrammar.TokenType[] tag; | |
|  | 74 | m_builder.Clear(); | |
|  | 75 | ||
|  | 76 | while (m_stringScanner.Execute(m_scanner, out tag)) { | |
|  | 77 | switch (tag[0]) { | |
|  | 61 | 78 | case JSONGrammar.TokenType.StringBound: | 
|  | 62 |  | |
|  | 63 | return new String(m_stringBuffer, 0, pos); | |
|  | 79 | return m_builder.ToString(); | |
|  | 64 | 80 | case JSONGrammar.TokenType.UnescapedChar: | 
|  | 65 | EnsureStringBufferSize(pos + m_tokenLen); | |
|  | 66 | Array.Copy(m_buffer, m_tokenOffset, m_stringBuffer, pos, m_tokenLen); | |
|  | 67 | pos += m_tokenLen; | |
|  | 81 | m_scanner.CopyTokenTo(m_builder); | |
|  | 68 | 82 | break; | 
|  | 69 | case JSONGrammar.TokenType.EscapedUnicode: | |
|  | 70 | EnsureStringBufferSize(pos + 1); | |
|  | 71 | m_ | |
|  | 83 | case JSONGrammar.TokenType.EscapedUnicode: // \xXXXX - unicode escape sequence | |
|  | 84 | m_scanner.CopyTokenTo(buf, 0); | |
|  | 85 | m_builder.Append(StringTranslator.TranslateHexUnicode(buf, 2)); | |
|  | 72 | 86 | pos++; | 
|  | 73 | 87 | break; | 
|  | 74 | case JSONGrammar.TokenType.EscapedChar: | |
|  | 75 |  | |
|  | 76 | m_ | |
|  | 77 | pos++; | |
|  | 88 | case JSONGrammar.TokenType.EscapedChar: // \t - escape sequence | |
|  | 89 | m_scanner.CopyTokenTo(buf, 0); | |
|  | 90 | m_builder.Append(StringTranslator.TranslateEscapedChar(buf[1])); | |
|  | 78 | 91 | break; | 
|  | 79 | 92 | default: | 
|  | 80 | 93 | break; | 
| @@ -84,13 +97,5 namespace Implab.Formats.JSON { | |||
|  | 84 | 97 | |
|  | 85 | 98 | throw new ParserException("Unexpected end of data"); | 
|  | 86 | 99 | } | 
|  | 87 | ||
|  | 88 | void EnsureStringBufferSize(int size) { | |
|  | 89 | if (size > m_stringBuffer.Length) { | |
|  | 90 | var newBuffer = new char[size]; | |
|  | 91 | m_stringBuffer.CopyTo(newBuffer, 0); | |
|  | 92 | m_stringBuffer = newBuffer; | |
|  | 93 | } | |
|  | 94 | } | |
|  | 95 | 100 | } | 
|  | 96 | 101 | } | 
| @@ -1,5 +1,5 | |||
|  | 1 | 1 | using Implab; | 
|  | 2 | using Implab. | |
|  | 2 | using Implab.Formats; | |
|  | 3 | 3 | using System; | 
|  | 4 | 4 | using System.Collections.Generic; | 
|  | 5 | 5 | using System.Diagnostics; | 
| @@ -7,11 +7,11 using System.Linq; | |||
|  | 7 | 7 | using System.Text; | 
|  | 8 | 8 | using System.Threading.Tasks; | 
|  | 9 | 9 | |
|  | 10 | namespace Implab.JSON { | |
|  | 10 | namespace Implab.Formats.JSON { | |
|  | 11 | 11 | /// <summary> | 
|  | 12 | 12 | /// Класс для преобразования экранированной строки JSON | 
|  | 13 | 13 | /// </summary> | 
|  | 14 | public class StringTranslator : Scanner { | |
|  | 14 | public class StringTranslator : TextScanner<JSONGrammar.TokenType> { | |
|  | 15 | 15 | static readonly char[] _escMap; | 
|  | 16 | 16 | static readonly int[] _hexMap; | 
|  | 17 | 17 | |
| @@ -34,8 +34,7 namespace Implab.JSON { | |||
|  | 34 | 34 | |
|  | 35 | 35 | } | 
|  | 36 | 36 | |
|  | 37 | public StringTranslator() | |
|  | 38 | : base(JSONGrammar.Instance.JsonStringDFA.States, JSONGrammar.Instance.JsonStringDFA.Alphabet.GetTranslationMap()) { | |
|  | 37 | public StringTranslator() { | |
|  | 39 | 38 | } | 
|  | 40 | 39 | |
|  | 41 | 40 | public string Translate(string data) { | 
| @@ -59,7 +58,7 namespace Implab.JSON { | |||
|  | 59 | 58 | int pos = 0; | 
|  | 60 | 59 | |
|  | 61 | 60 | while (ReadTokenInternal()) { | 
|  | 62 | switch ((JSONGrammar.TokenType) | |
|  | 61 | switch ((JSONGrammar.TokenType)Tags[0]) { | |
|  | 63 | 62 | case JSONGrammar.TokenType.UnescapedChar: | 
|  | 64 | 63 | Array.Copy(m_buffer,m_tokenOffset,translated,pos,m_tokenLen); | 
|  | 65 | 64 | pos += m_tokenLen; | 
| @@ -3,50 +3,146 using Implab.Components; | |||
|  | 3 | 3 | using Implab.Automaton.RegularExpressions; | 
|  | 4 | 4 | using System.Diagnostics; | 
|  | 5 | 5 | using Implab.Automaton; | 
|  | 6 | using System.IO; | |
|  | 7 | using System.Text; | |
|  | 6 | 8 | |
|  | 7 | 9 | namespace Implab.Formats { | 
|  | 8 | public abstract class TextScanner | |
|  | 10 | public abstract class TextScanner : Disposable { | |
|  | 11 | readonly int m_bufferMax; | |
|  | 12 | readonly int m_chunkSize; | |
|  | 9 | 13 | |
|  | 10 | int m_maxSymbol; | |
|  | 11 | int[] m_symbolMap; | |
|  | 12 | ||
|  | 13 | readonly char[] m_buffer; | |
|  | 14 | char[] m_buffer; | |
|  | 14 | 15 | int m_bufferOffset; | 
|  | 15 | 16 | int m_bufferSize; | 
|  | 17 | int m_tokenOffset; | |
|  | 16 | 18 | int m_tokenLength; | 
|  | 17 | 19 | |
|  | 18 | TTag[] m_tags; | |
|  | 20 | /// <summary> | |
|  | 21 | /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner`1"/> class. | |
|  | 22 | /// </summary> | |
|  | 23 | /// <param name="bufferMax">Buffer max.</param> | |
|  | 24 | /// <param name="chunkSize">Chunk size.</param> | |
|  | 25 | protected TextScanner(int bufferMax, int chunkSize) { | |
|  | 26 | Debug.Assert(m_chunkSize <= m_bufferMax); | |
|  | 27 | ||
|  | 28 | m_bufferMax = bufferMax; | |
|  | 29 | m_chunkSize = chunkSize; | |
|  | 30 | } | |
|  | 19 | 31 | |
|  | 20 | protected bool ReadTokenInternal(DFAStateDescriptor<TTag>[] dfa, int state) { | |
|  | 21 | Debug.Assert(dfa != null); | |
|  | 32 | /// <summary> | |
|  | 33 | /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner`1"/> class. | |
|  | 34 | /// </summary> | |
|  | 35 | /// <param name="buffer">Buffer.</param> | |
|  | 36 | protected TextScanner(char[] buffer) { | |
|  | 37 | if (buffer != null) { | |
|  | 38 | m_buffer = buffer; | |
|  | 39 | m_bufferSize = buffer.Length; | |
|  | 40 | } | |
|  | 41 | } | |
|  | 42 | ||
|  | 43 | /// <summary> | |
|  | 44 | /// (hungry) Reads the next token. | |
|  | 45 | /// </summary> | |
|  | 46 | /// <returns><c>true</c>, if token internal was read, <c>false</c> if there is no more tokens in the stream.</returns> | |
|  | 47 | /// <param name="dfa">The transition map for the automaton</param> | |
|  | 48 | /// <param name="final">Final states of the automaton.</param> | |
|  | 49 | /// <param name="tags">Tags.</param> | |
|  | 50 | /// <param name="state">The initial state for the automaton.</param> | |
|  | 51 | internal bool ReadToken<TTag>(int[,] dfa, int[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) { | |
|  | 52 | Safe.ArgumentNotNull(); | |
|  | 53 | m_tokenLength = 0; | |
|  | 54 | ||
|  | 55 | var maxSymbol = alphabet.Length - 1; | |
|  | 22 | 56 | |
|  | 23 | 57 | do { | 
|  | 24 | for (var pos = m_bufferOffset; pos < m_bufferSize; pos++) { | |
|  | 58 | // after the next chunk is read the offset in the buffer may change | |
|  | 59 | int pos = m_bufferOffset + m_tokenLength; | |
|  | 60 | ||
|  | 61 | while(pos < m_bufferSize) { | |
|  | 25 | 62 | var ch = m_buffer[pos]; | 
|  | 26 | state = dfa[state].transitions[m_symbolMap[ch > m_maxSymbol ? m_maxSymbol : ch]]; | |
|  | 63 | ||
|  | 64 | state = dfa[state,ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]]; | |
|  | 27 | 65 | if (state == DFAConst.UNREACHABLE_STATE) | 
|  | 28 | 66 | break; | 
|  | 67 | ||
|  | 68 | pos++; | |
|  | 29 | 69 | } | 
|  | 30 | } while (Feed()); | |
|  | 70 | ||
|  | 71 | m_tokenLength = pos - m_bufferOffset; | |
|  | 72 | } while (state != DFAConst.UNREACHABLE_STATE && Feed()); | |
|  | 73 | ||
|  | 74 | m_tokenOffset = m_bufferOffset; | |
|  | 75 | m_bufferOffset += m_tokenLength; | |
|  | 31 | 76 | |
|  | 32 | if ( | |
|  | 77 | if (final[state]) { | |
|  | 78 | tag = tags[state]; | |
|  | 79 | return true; | |
|  | 80 | } else { | |
|  | 81 | if (m_bufferOffset == m_bufferSize) { | |
|  | 82 | if (m_tokenLength == 0) //EOF | |
|  | 83 | return false; | |
|  | 84 | ||
|  | 85 | throw new ParserException(); | |
|  | 86 | } | |
|  | 87 | throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset])); | |
|  | 88 | ||
|  | 89 | } | |
|  | 90 | } | |
|  | 33 | 91 | |
|  | 34 | } | |
|  | 35 | ||
|  | 92 | protected void Feed(char[] buffer, int offset, int length) { | |
|  | 93 | m_buffer = buffer; | |
|  | 94 | m_bufferOffset = offset; | |
|  | 95 | m_bufferSize = offset + length; | |
|  | 36 | 96 | } | 
|  | 37 | 97 | |
|  | 38 | bool Feed() { | |
|  | 98 | protected bool Feed() { | |
|  | 99 | if (m_chunkSize <= 0) | |
|  | 100 | return false; | |
|  | 101 | ||
|  | 102 | if (m_buffer != null) { | |
|  | 103 | var free = m_buffer.Length - m_bufferSize; | |
|  | 104 | ||
|  | 105 | if (free < m_chunkSize) { | |
|  | 106 | free += m_chunkSize; | |
|  | 107 | var used = m_bufferSize - m_bufferOffset; | |
|  | 108 | var size = used + free; | |
|  | 109 | ||
|  | 110 | if (size > m_bufferMax) | |
|  | 111 | throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached"), m_bufferMax/1024); | |
|  | 112 | ||
|  | 113 | var temp = new char[size]; | |
|  | 39 | 114 | |
|  | 115 | var read = Read(temp, used, m_chunkSize); | |
|  | 116 | if (read == 0) | |
|  | 117 | return false; | |
|  | 118 | ||
|  | 119 | Array.Copy(m_buffer, m_bufferOffset, temp, 0, used); | |
|  | 120 | ||
|  | 121 | m_bufferOffset = 0; | |
|  | 122 | m_bufferSize = used + read; | |
|  | 123 | m_buffer = temp; | |
|  | 124 | } | |
|  | 125 | } else { | |
|  | 126 | Debug.Assert(m_bufferOffset == 0); | |
|  | 127 | m_buffer = new char[m_chunkSize]; | |
|  | 128 | m_bufferSize = Read(m_buffer, 0, m_chunkSize); | |
|  | 129 | return (m_bufferSize != 0); | |
|  | 130 | } | |
|  | 40 | 131 | } | 
|  | 41 | 132 | |
|  | 42 | 133 | protected abstract int Read(char[] buffer, int offset, int size); | 
|  | 43 | 134 | |
|  | 44 | protected TTag[] Tags { | |
|  | 45 | get { | |
|  | 46 | return m_tags; | |
|  | 47 | } | |
|  | 135 | public string GetTokenValue() { | |
|  | 136 | return new String(m_buffer, m_tokenOffset, m_tokenLength); | |
|  | 48 | 137 | } | 
|  | 49 | 138 | |
|  | 139 | public void CopyTokenTo(char[] buffer, int offset) { | |
|  | 140 | m_buffer.CopyTo(buffer, offset); | |
|  | 141 | } | |
|  | 142 | ||
|  | 143 | public void CopyTokenTo(StringBuilder sb) { | |
|  | 144 | sb.Append(m_buffer, m_tokenOffset, m_tokenLength); | |
|  | 145 | } | |
|  | 50 | 146 | |
|  | 51 | 147 | } | 
|  | 52 | 148 | } | 
| @@ -151,11 +151,9 | |||
|  | 151 | 151 | <Compile Include="Components\ExecutionState.cs" /> | 
|  | 152 | 152 | <Compile Include="Components\RunnableComponent.cs" /> | 
|  | 153 | 153 | <Compile Include="Components\IFactory.cs" /> | 
|  | 154 | <Compile Include="Automaton\DFAStateDescriptor.cs" /> | |
|  | 155 | 154 | <Compile Include="Automaton\EnumAlphabet.cs" /> | 
|  | 156 | 155 | <Compile Include="Automaton\IAlphabet.cs" /> | 
|  | 157 | 156 | <Compile Include="Automaton\ParserException.cs" /> | 
|  | 158 | <Compile Include="Automaton\Scanner.cs" /> | |
|  | 159 | 157 | <Compile Include="Automaton\IndexedAlphabetBase.cs" /> | 
|  | 160 | 158 | <Compile Include="Automaton\IAlphabetBuilder.cs" /> | 
|  | 161 | 159 | <Compile Include="Automaton\RegularExpressions\AltToken.cs" /> | 
| @@ -190,9 +188,10 | |||
|  | 190 | 188 | <Compile Include="Automaton\RegularExpressions\RegularDFA.cs" /> | 
|  | 191 | 189 | <Compile Include="Automaton\RegularExpressions\RegularExpressionVisitor.cs" /> | 
|  | 192 | 190 | <Compile Include="Automaton\RegularExpressions\ITaggedDFABuilder.cs" /> | 
|  | 193 | <Compile Include="Automaton\RegularExpressions\DFAStateDescriptorT.cs" /> | |
|  | 194 | <Compile Include="Formats\BufferScanner.cs" /> | |
|  | 195 | 191 | <Compile Include="Formats\TextScanner.cs" /> | 
|  | 192 | <Compile Include="Formats\StringScanner.cs" /> | |
|  | 193 | <Compile Include="Formats\ReaderScanner.cs" /> | |
|  | 194 | <Compile Include="Formats\ScannerContext.cs" /> | |
|  | 196 | 195 | </ItemGroup> | 
|  | 197 | 196 | <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" /> | 
|  | 198 | 197 | <ItemGroup /> | 
|  | 1 | NO CONTENT: file was removed | 
|  | 1 | NO CONTENT: file was removed | 
|  | 1 | NO CONTENT: file was removed | 
|  | 1 | NO CONTENT: file was removed | 
        
        General Comments 0
    
    
  
  
                      You need to be logged in to leave comments.
                      Login now
                    
                