##// END OF EJS Templates
rewritten the text scanner
cin -
r176:0c3c69fe225b ref20160224
parent child
Show More
@@ -0,0 +1,30
1 using System;
2 using System.IO;
3
namespace Implab.Formats {
    /// <summary>
    /// A <see cref="TextScanner"/> which pulls its input from a <see cref="TextReader"/>.
    /// </summary>
    public class ReaderScanner: TextScanner {
        // Defaults applied by the single-argument constructor.
        const int DEFAULT_CHUNK = 1024;
        const int DEFAULT_LIMIT = DEFAULT_CHUNK*1024;

        readonly TextReader m_reader;

        /// <summary>
        /// Creates a scanner over the specified reader using the default buffer limit and chunk size.
        /// </summary>
        /// <param name="reader">The source of the characters, must not be null.</param>
        public ReaderScanner(TextReader reader) : this(reader, DEFAULT_LIMIT, DEFAULT_CHUNK) {
        }

        /// <summary>
        /// Creates a scanner over the specified reader.
        /// </summary>
        /// <param name="reader">The source of the characters, must not be null.</param>
        /// <param name="limit">Forwarded to the base scanner as the maximum buffer size.</param>
        /// <param name="chunk">Forwarded to the base scanner as the chunk size.</param>
        public ReaderScanner(TextReader reader, int limit, int chunk) : base(limit, chunk) {
            Safe.ArgumentNotNull(reader, "reader");
            m_reader = reader;
        }

        /// <summary>
        /// Reads up to <paramref name="size"/> characters from the underlying reader.
        /// </summary>
        /// <returns>The number of characters reported by the reader.</returns>
        protected override int Read(char[] buffer, int offset, int size) {
            var count = m_reader.Read(buffer, offset, size);
            return count;
        }

        /// <summary>
        /// Releases the underlying reader when the scanner is disposed explicitly.
        /// </summary>
        protected override void Dispose(bool disposing) {
            if (disposing) {
                Safe.Dispose(m_reader);
            }

            base.Dispose(disposing);
        }
    }
}
30
@@ -0,0 +1,24
1 using System;
2
namespace Implab.Formats {
    /// <summary>
    /// Holds the tables describing an automaton together with its initial state,
    /// so the automaton can be executed against a <see cref="TextScanner"/>.
    /// </summary>
    /// <typeparam name="TTag">The type of the tags attached to the states.</typeparam>
    public class ScannerContext<TTag> {
        readonly int[,] m_dfa;
        readonly bool[] m_final;
        readonly TTag[][] m_tags;
        readonly int m_state;
        readonly int[] m_alphabet;

        public ScannerContext(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet) {
            m_dfa = dfa;
            m_final = final;
            m_tags = tags;
            m_state = state;
            m_alphabet = alphabet;
        }

        /// <summary>The transition table.</summary>
        public int[,] Dfa {
            get { return m_dfa; }
        }

        /// <summary>The final state flags, indexed by state.</summary>
        public bool[] Final {
            get { return m_final; }
        }

        /// <summary>The tags, indexed by state.</summary>
        public TTag[][] Tags {
            get { return m_tags; }
        }

        /// <summary>The state the automaton starts from.</summary>
        public int State {
            get { return m_state; }
        }

        /// <summary>The alphabet map handed to the scanner.</summary>
        public int[] Alphabet {
            get { return m_alphabet; }
        }

        /// <summary>
        /// Runs the automaton on the scanner by delegating to its <c>ReadToken</c> method.
        /// </summary>
        /// <param name="scanner">The scanner supplying the input.</param>
        /// <param name="tag">Receives the tags produced by <c>ReadToken</c>.</param>
        /// <returns>The result of <c>ReadToken</c>.</returns>
        public bool Execute(TextScanner scanner, out TTag[] tag) {
            return scanner.ReadToken(m_dfa, m_final, m_tags, m_state, m_alphabet, out tag);
        }
    }
}
24
@@ -0,0 +1,26
1 using System;
2
namespace Implab.Formats {
    /// <summary>
    /// A <see cref="TextScanner"/> which scans the contents of a string.
    /// </summary>
    public class StringScanner: TextScanner {

        /// <summary>
        /// Creates a scanner over the specified text.
        /// </summary>
        /// <param name="text">The text to scan, must not be null.</param>
        /// <remarks>
        /// The whole text is handed to the base scanner as a ready buffer. The base
        /// scanner is created without a chunk size, so its parameterless <c>Feed()</c>
        /// reports the end of the input instead of trying to grow the buffer beyond
        /// the text (which previously tripped the buffer limit check on valid input).
        /// </remarks>
        public StringScanner(string text) : base(null) {
            // validate explicitly: dereferencing text in the base call would fail
            // with a NullReferenceException before any check could run
            Safe.ArgumentNotNull(text, "text");

            var data = text.ToCharArray();
            Feed(data, 0, data.Length);
        }

        /// <summary>
        /// Always returns 0: the entire text is already in the buffer, there is nothing more to read.
        /// </summary>
        protected override int Read(char[] buffer, int offset, int size) {
            return 0;
        }
    }
}
26
@@ -100,17 +100,25 namespace Implab.Automaton {
100 return GetEnumerator();
100 return GetEnumerator();
101 }
101 }
102
102
/// <summary>
/// Creates the dense transition table of the automaton.
/// </summary>
/// <returns>
/// The table indexed by [state, input class]. Cells without a transition
/// contain <c>DFAConst.UNREACHABLE_STATE</c>.
/// </returns>
public int[,] CreateTransitionTable() {
    var table = new int[StateCount, AlphabetSize];

    // mark every cell as unreachable before the known transitions are written
    for (int i = 0; i < StateCount; i++)
        for (int j = 0; j < AlphabetSize; j++) // the bound must test j, not i
            table[i, j] = DFAConst.UNREACHABLE_STATE;

    foreach (var t in this)
        table[t.s1, t.edge] = t.s2;

    return table;
}
113
115
/// <summary>
/// Creates the table of final state flags.
/// </summary>
/// <returns>An array with one entry per state, <c>true</c> for the states listed in <c>FinalStates</c>.</returns>
public bool[] CreateFinalStateTable() {
    var flags = new bool[StateCount];

    foreach (var state in FinalStates) {
        flags[state] = true;
    }

    return flags;
}
116
124
@@ -13,82 +13,38 namespace Implab.Automaton {
13 /// to the input alphabet of the automaton. It's assumed that the index to the symbol match
13 /// to the input alphabet of the automaton. It's assumed that the index to the symbol match
14 /// is well known and documented.
14 /// is well known and documented.
15 /// </remarks>
15 /// </remarks>
16 public abstract class IndexedAlphabetBase<T> : IAlphabetBuilder<T> {
16 public abstract class IndexedAlphabetBase<T> : MapAlphabet<T> {
17 int m_nextId = 1;
18 readonly int[] m_map;
19
20 protected IndexedAlphabetBase(int mapSize) {
21 m_map = new int[mapSize];
22 }
23
24 protected IndexedAlphabetBase(int[] map) {
25 Debug.Assert(map != null && map.Length > 0);
26 Debug.Assert(map.All(x => x >= 0));
27
28 m_map = map;
29 m_nextId = map.Max() + 1;
30 }
31
32 public int DefineSymbol(T symbol) {
33 var index = GetSymbolIndex(symbol);
34 if (m_map[index] == DFAConst.UNCLASSIFIED_INPUT)
35 m_map[index] = m_nextId++;
36 return m_map[index];
37 }
38
39 public int DefineSymbol(T symbol, int cls) {
40 var index = GetSymbolIndex(symbol);
41 m_map[index] = cls;
42 m_nextId = Math.Max(cls + 1, m_nextId);
43 return cls;
44 }
45
17
46 public int DefineClass(IEnumerable<T> symbols) {
18 protected IndexedAlphabetBase() :base(true, null) {
47 return DefineClass(symbols, m_nextId);
48 }
49
50 public int DefineClass(IEnumerable<T> symbols, int cls) {
51 Safe.ArgumentNotNull(symbols, "symbols");
52 symbols = symbols.Distinct();
53
54 foreach (var symbol in symbols)
55 m_map[GetSymbolIndex(symbol)] = cls;
56
57 m_nextId = Math.Max(cls + 1, m_nextId);
58
59 return cls;
60 }
61
62 public virtual int Translate(T symbol) {
63 return m_map[GetSymbolIndex(symbol)];
64 }
65
66 public int Count {
67 get { return m_nextId; }
68 }
69
70 public bool Contains(T symbol) {
71 return true;
72 }
73
74 public IEnumerable<T> GetSymbols(int cls) {
75 for (var i = 0; i < m_map.Length; i++)
76 if (m_map[i] == cls)
77 yield return GetSymbolByIndex(i);
78 }
19 }
79
20
80 public abstract int GetSymbolIndex(T symbol);
21 public abstract int GetSymbolIndex(T symbol);
81
22
82 public abstract T GetSymbolByIndex(int index);
83
84 public abstract IEnumerable<T> InputSymbols { get; }
85
/// <summary>
/// Gets the translation map from the index of a symbol to its class; this is useful for the optimized translation of input symbols.
/// </summary>
/// <remarks>
/// The map is continuous and starts from the symbol with zero code. The last symbol
/// in the map is the last classified symbol in the alphabet, i.e. the map can be
/// shorter than the whole alphabet.
/// </remarks>
/// <returns>The translation map.</returns>
public int[] GetTranslationMap() {
    var map = new Dictionary<int, int>();

    // a local must be definitely assigned before it is read by Math.Max
    int max = 0;
    foreach (var p in Mappings) {
        var index = GetSymbolIndex(p.Key);
        max = Math.Max(max, index);
        map[index] = p.Value;
    }

    var result = new int[max + 1];

    // indices without a mapping receive default(int) from TryGetValue
    for (int i = 0; i < result.Length; i++)
        map.TryGetValue(i, out result[i]);

    return result;
}
93 }
49 }
94 }
50 }
@@ -69,9 +69,16 namespace Implab.Automaton {
69
69
70
70
71 public IEnumerable<T> GetSymbols(int cls) {
71 public IEnumerable<T> GetSymbols(int cls) {
72 Safe.ArgumentAssert(cls > 0, "cls");
72 return m_map.Where(p => p.Value == cls).Select(p => p.Key);
73 return m_map.Where(p => p.Value == cls).Select(p => p.Key);
73 }
74 }
74 #endregion
75 #endregion
76
77 public IEnumerable<KeyValuePair<T,int>> Mappings {
78 get {
79 return m_map;
80 }
81 }
75 }
82 }
76 }
83 }
77
84
@@ -66,9 +66,9 namespace Implab.Automaton.RegularExpres
66 return Token<TTag>.New( Enumerable.Range(0, AlphabetBuilder.Count).Except(TranslateOrDie(symbols)).ToArray() );
66 return Token<TTag>.New( Enumerable.Range(0, AlphabetBuilder.Count).Except(TranslateOrDie(symbols)).ToArray() );
67 }
67 }
68
68
69 protected abstract IAlphabetBuilder<TSymbol> CreateAlphabet();
69 protected abstract IndexedAlphabetBase<TSymbol> CreateAlphabet();
70
70
71 protected RegularDFA<TSymbol, TTag> BuildDFA(Token<TTag> regexp) {
71 protected ScannerContext<TTag> BuildScannerContext(Token<TTag> regexp) {
72
72
73 var dfa = new RegularDFA<TSymbol, TTag>(AlphabetBuilder);
73 var dfa = new RegularDFA<TSymbol, TTag>(AlphabetBuilder);
74
74
@@ -80,7 +80,16 namespace Implab.Automaton.RegularExpres
80 if (dfa.IsFinalState(dfa.InitialState))
80 if (dfa.IsFinalState(dfa.InitialState))
81 throw new ApplicationException("The specified language contains empty token");
81 throw new ApplicationException("The specified language contains empty token");
82
82
83 return dfa.Optimize(CreateAlphabet());
83 var ab = CreateAlphabet();
84 var optimal = dfa.Optimize(ab);
85
86 return new ScannerContext<TTag>(
87 optimal.CreateTransitionTable(),
88 optimal.CreateFinalStateTable(),
89 optimal.CreateTagTable(),
90 optimal.InitialState,
91 ab.GetTranslationMap()
92 );
84 }
93 }
85
94
86 }
95 }
@@ -36,16 +36,11 namespace Implab.Automaton.RegularExpres
36 return m_tags.TryGetValue(s, out tags) ? tags : new TTag[0];
36 return m_tags.TryGetValue(s, out tags) ? tags : new TTag[0];
37 }
37 }
38
38
/// <summary>
/// Creates the table of tags indexed by state.
/// </summary>
/// <returns>
/// An array with one slot per state; the slots of the states which have no
/// entry in the tag map are left unassigned (null).
/// </returns>
public TTag[][] CreateTagTable() {
    var tagsByState = new TTag[StateCount][];

    foreach (var entry in m_tags) {
        tagsByState[entry.Key] = entry.Value;
    }

    return tagsByState;
}
@@ -4,7 +4,7 using Implab.Automaton;
4
4
5 namespace Implab.Formats {
5 namespace Implab.Formats {
6 public class ByteAlphabet : IndexedAlphabetBase<byte> {
6 public class ByteAlphabet : IndexedAlphabetBase<byte> {
7 public ByteAlphabet() : base(byte.MaxValue + 1){
7 public ByteAlphabet() {
8 }
8 }
9
9
10 #region implemented abstract members of IndexedAlphabetBase
10 #region implemented abstract members of IndexedAlphabetBase
@@ -13,10 +13,6 namespace Implab.Formats {
13 return (int)symbol;
13 return (int)symbol;
14 }
14 }
15
15
16 public override byte GetSymbolByIndex(int index) {
17 return (byte)index;
18 }
19
20 public IEnumerable<byte> InputSymbols {
16 public IEnumerable<byte> InputSymbols {
21 get {
17 get {
22 return Enumerable.Range(byte.MinValue, byte.MaxValue).Cast<byte>();
18 return Enumerable.Range(byte.MinValue, byte.MaxValue).Cast<byte>();
@@ -5,19 +5,14 using Implab.Automaton;
5 namespace Implab.Formats {
5 namespace Implab.Formats {
/// <summary>
/// The indexed alphabet of characters, the index of a symbol is its character code.
/// </summary>
public class CharAlphabet: IndexedAlphabetBase<char> {

    public CharAlphabet() {
    }

    /// <summary>
    /// Returns the code of the character as its index.
    /// </summary>
    public override int GetSymbolIndex(char symbol) {
        return symbol;
    }

    /// <summary>
    /// Enumerates every character from <c>char.MinValue</c> to <c>char.MaxValue</c> inclusive.
    /// </summary>
    public IEnumerable<char> InputSymbols {
        get {
            // Range takes (start, count), so the count has to cover MaxValue itself.
            // Select with an explicit conversion is used because Cast<char>() unboxes
            // each boxed int as a char and fails with an InvalidCastException.
            return Enumerable
                .Range(char.MinValue, char.MaxValue - char.MinValue + 1)
                .Select(x => (char)x);
        }
    }
}
@@ -20,14 +20,7 namespace Implab.Formats.JSON {
20 StringBound,
20 StringBound,
21 EscapedChar,
21 EscapedChar,
22 UnescapedChar,
22 UnescapedChar,
23 EscapedUnicode,
23 EscapedUnicode
24
25 Minus,
26 Plus,
27 Sign,
28 Integer,
29 Dot,
30 Exp
31 }
24 }
32
25
33 static Lazy<JSONGrammar> _instance = new Lazy<JSONGrammar>();
26 static Lazy<JSONGrammar> _instance = new Lazy<JSONGrammar>();
@@ -36,8 +29,8 namespace Implab.Formats.JSON {
36 get { return _instance.Value; }
29 get { return _instance.Value; }
37 }
30 }
38
31
39 readonly RegularDFA<char, TokenType> m_jsonDFA;
32 readonly ScannerContext<TokenType> m_jsonDFA;
40 readonly RegularDFA<char, TokenType> m_stringDFA;
33 readonly ScannerContext<TokenType> m_stringDFA;
41
34
42 public JSONGrammar() {
35 public JSONGrammar() {
43 DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));
36 DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));
@@ -88,17 +81,17 namespace Implab.Formats.JSON {
88 .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));
81 .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));
89
82
90
83
91 m_jsonDFA = BuildDFA(jsonExpression);
84 m_jsonDFA = BuildScannerContext(jsonExpression);
92 m_stringDFA = BuildDFA(jsonStringExpression);
85 m_stringDFA = BuildScannerContext(jsonStringExpression);
93 }
86 }
94
87
95 public RegularDFA<char, TokenType> JsonDFA {
88 public ScannerContext<TokenType> JsonDFA {
96 get {
89 get {
97 return m_jsonDFA;
90 return m_jsonDFA;
98 }
91 }
99 }
92 }
100
93
101 public RegularDFA<char,TokenType> JsonStringDFA {
94 public ScannerContext<TokenType> JsonStringDFA {
102 get {
95 get {
103 return m_stringDFA;
96 return m_stringDFA;
104 }
97 }
@@ -1,25 +1,37
1 using System;
1 using System;
2 using System.Globalization;
2 using System.Globalization;
3 using Implab.Automaton;
3 using Implab.Automaton;
4 using System.Text;
5 using Implab.Components;
6 using System.IO;
7 using Implab.Automaton.RegularExpressions;
4
8
5 namespace Implab.Formats.JSON {
9 namespace Implab.Formats.JSON {
6 /// <summary>
10 /// <summary>
7 /// Сканнер (лексер), разбивающий поток символов на токены JSON.
11 /// Сканнер (лексер), разбивающий поток символов на токены JSON.
8 /// </summary>
12 /// </summary>
9 public class JSONScanner : Scanner<object> {
13 public class JSONScanner : Disposable {
10 char[] m_stringBuffer;
14 readonly StringBuilder m_builder = new StringBuilder();
11 DFAStateDescriptior<>[] m_stringDFA;
15
12 int[] m_stringAlphabet;
16 readonly ScannerContext<JSONGrammar.TokenType> m_jsonScanner = JSONGrammar.Instance.JsonDFA;
17 readonly ScannerContext<JSONGrammar.TokenType> m_stringScanner = JSONGrammar.Instance.JsonStringDFA;
18
19
20 readonly TextScanner m_scanner;
13
21
/// <summary>
/// Creates a new scanner instance which reads the tokens from the specified text.
/// </summary>
/// <param name="text">The text to scan, must not be empty.</param>
public JSONScanner(string text) {
    Safe.ArgumentNotEmpty(text, "text");

    m_scanner = new StringScanner(text);
}

/// <summary>
/// Creates a new scanner instance which reads the tokens from the specified reader.
/// </summary>
/// <param name="reader">The source of the characters, must not be null.</param>
/// <param name="bufferMax">The buffer limit passed to the underlying <see cref="ReaderScanner"/>.</param>
/// <param name="chunkSize">The chunk size passed to the underlying <see cref="ReaderScanner"/>.</param>
public JSONScanner(TextReader reader, int bufferMax, int chunkSize) {
    Safe.ArgumentNotNull(reader, "reader");

    // forward the limits, otherwise the caller's settings are silently replaced by the defaults
    m_scanner = new ReaderScanner(reader, bufferMax, chunkSize);
}
24
36
25 /// <summary>
37 /// <summary>
@@ -31,19 +43,20 namespace Implab.Formats.JSON {
31 /// <remarks>В случе если токен не распознается, возникает исключение. Значения токенов обрабатываются, т.е.
43 /// <remarks>В случе если токен не распознается, возникает исключение. Значения токенов обрабатываются, т.е.
32 /// в строках обрабатываются экранированные символы, числа становтся типа double.</remarks>
44 /// в строках обрабатываются экранированные символы, числа становтся типа double.</remarks>
33 public bool ReadToken(out object tokenValue, out JsonTokenType tokenType) {
45 public bool ReadToken(out object tokenValue, out JsonTokenType tokenType) {
34 if (ReadTokenInternal()) {
46 JSONGrammar.TokenType[] tag;
35 switch ((JSONGrammar.TokenType)m_currentState.tag[0]) {
47 if (m_jsonScanner.Execute(m_scanner, out tag)) {
48 switch (tag[0]) {
36 case JSONGrammar.TokenType.StringBound:
49 case JSONGrammar.TokenType.StringBound:
37 tokenValue = ReadString();
50 tokenValue = ReadString();
38 tokenType = JsonTokenType.String;
51 tokenType = JsonTokenType.String;
39 break;
52 break;
40 case JSONGrammar.TokenType.Number:
53 case JSONGrammar.TokenType.Number:
41 tokenValue = Double.Parse(new String(m_buffer, m_tokenOffset, m_tokenLen), CultureInfo.InvariantCulture);
54 tokenValue = Double.Parse(m_scanner.GetTokenValue(), CultureInfo.InvariantCulture);
42 tokenType = JsonTokenType.Number;
55 tokenType = JsonTokenType.Number;
43 break;
56 break;
44 default:
57 default:
45 tokenType = (JsonTokenType)m_currentState.tag[0];
58 tokenType = (JsonTokenType)tag[0];
46 tokenValue = new String(m_buffer, m_tokenOffset, m_tokenLen);
59 tokenValue = m_scanner.GetTokenValue();
47 break;
60 break;
48 }
61 }
49 return true;
62 return true;
@@ -55,26 +68,26 namespace Implab.Formats.JSON {
55
68
56 string ReadString() {
69 string ReadString() {
57 int pos = 0;
70 int pos = 0;
58 Switch(m_stringDFA, m_stringAlphabet);
71 char[] buf = new char[6]; // the buffer for unescaping chars
59 while (ReadTokenInternal()) {
72
60 switch ((JSONGrammar.TokenType)m_currentState.tag[0]) {
73 JSONGrammar.TokenType[] tag;
74 m_builder.Clear();
75
76 while (m_stringScanner.Execute(m_scanner, out tag)) {
77 switch (tag[0]) {
61 case JSONGrammar.TokenType.StringBound:
78 case JSONGrammar.TokenType.StringBound:
62 Restore();
79 return m_builder.ToString();
63 return new String(m_stringBuffer, 0, pos);
64 case JSONGrammar.TokenType.UnescapedChar:
80 case JSONGrammar.TokenType.UnescapedChar:
65 EnsureStringBufferSize(pos + m_tokenLen);
81 m_scanner.CopyTokenTo(m_builder);
66 Array.Copy(m_buffer, m_tokenOffset, m_stringBuffer, pos, m_tokenLen);
67 pos += m_tokenLen;
68 break;
82 break;
69 case JSONGrammar.TokenType.EscapedUnicode:
83 case JSONGrammar.TokenType.EscapedUnicode: // \xXXXX - unicode escape sequence
70 EnsureStringBufferSize(pos + 1);
84 m_scanner.CopyTokenTo(buf, 0);
71 m_stringBuffer[pos] = StringTranslator.TranslateHexUnicode(m_buffer, m_tokenOffset + 2);
85 m_builder.Append(StringTranslator.TranslateHexUnicode(buf, 2));
72 pos++;
86 pos++;
73 break;
87 break;
74 case JSONGrammar.TokenType.EscapedChar:
88 case JSONGrammar.TokenType.EscapedChar: // \t - escape sequence
75 EnsureStringBufferSize(pos + 1);
89 m_scanner.CopyTokenTo(buf, 0);
76 m_stringBuffer[pos] = StringTranslator.TranslateEscapedChar(m_buffer[m_tokenOffset + 1]);
90 m_builder.Append(StringTranslator.TranslateEscapedChar(buf[1]));
77 pos++;
78 break;
91 break;
79 default:
92 default:
80 break;
93 break;
@@ -84,13 +97,5 namespace Implab.Formats.JSON {
84
97
85 throw new ParserException("Unexpected end of data");
98 throw new ParserException("Unexpected end of data");
86 }
99 }
87
88 void EnsureStringBufferSize(int size) {
89 if (size > m_stringBuffer.Length) {
90 var newBuffer = new char[size];
91 m_stringBuffer.CopyTo(newBuffer, 0);
92 m_stringBuffer = newBuffer;
93 }
100 }
94 }
101 }
95 }
96 }
@@ -1,5 +1,5
1 using Implab;
1 using Implab;
2 using Implab.Parsing;
2 using Implab.Formats;
3 using System;
3 using System;
4 using System.Collections.Generic;
4 using System.Collections.Generic;
5 using System.Diagnostics;
5 using System.Diagnostics;
@@ -7,11 +7,11 using System.Linq;
7 using System.Text;
7 using System.Text;
8 using System.Threading.Tasks;
8 using System.Threading.Tasks;
9
9
10 namespace Implab.JSON {
10 namespace Implab.Formats.JSON {
11 /// <summary>
11 /// <summary>
12 /// Класс для преобразования экранированной строки JSON
12 /// Класс для преобразования экранированной строки JSON
13 /// </summary>
13 /// </summary>
14 public class StringTranslator : Scanner {
14 public class StringTranslator : TextScanner<JSONGrammar.TokenType> {
15 static readonly char[] _escMap;
15 static readonly char[] _escMap;
16 static readonly int[] _hexMap;
16 static readonly int[] _hexMap;
17
17
@@ -34,8 +34,7 namespace Implab.JSON {
34
34
35 }
35 }
36
36
37 public StringTranslator()
37 public StringTranslator() {
38 : base(JSONGrammar.Instance.JsonStringDFA.States, JSONGrammar.Instance.JsonStringDFA.Alphabet.GetTranslationMap()) {
39 }
38 }
40
39
41 public string Translate(string data) {
40 public string Translate(string data) {
@@ -59,7 +58,7 namespace Implab.JSON {
59 int pos = 0;
58 int pos = 0;
60
59
61 while (ReadTokenInternal()) {
60 while (ReadTokenInternal()) {
62 switch ((JSONGrammar.TokenType)TokenTags[0]) {
61 switch ((JSONGrammar.TokenType)Tags[0]) {
63 case JSONGrammar.TokenType.UnescapedChar:
62 case JSONGrammar.TokenType.UnescapedChar:
64 Array.Copy(m_buffer,m_tokenOffset,translated,pos,m_tokenLen);
63 Array.Copy(m_buffer,m_tokenOffset,translated,pos,m_tokenLen);
65 pos += m_tokenLen;
64 pos += m_tokenLen;
@@ -3,50 +3,146 using Implab.Components;
3 using Implab.Automaton.RegularExpressions;
3 using Implab.Automaton.RegularExpressions;
4 using System.Diagnostics;
4 using System.Diagnostics;
5 using Implab.Automaton;
5 using Implab.Automaton;
6 using System.IO;
7 using System.Text;
6
8
namespace Implab.Formats {
    /// <summary>
    /// Base class for the scanners which split a stream of characters into tokens.
    /// The characters are kept in an internal buffer which is refilled through
    /// <see cref="Read"/>; the token boundaries are found by running an automaton
    /// supplied to <c>ReadToken</c>.
    /// </summary>
    public abstract class TextScanner : Disposable {
        readonly int m_bufferMax;
        readonly int m_chunkSize;

        char[] m_buffer;
        int m_bufferOffset;
        int m_bufferSize;
        int m_tokenOffset;
        int m_tokenLength;

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class.
        /// </summary>
        /// <param name="bufferMax">The maximum size the buffer is allowed to grow to.</param>
        /// <param name="chunkSize">The number of characters requested from <see cref="Read"/> at once.</param>
        protected TextScanner(int bufferMax, int chunkSize) {
            // check the arguments: the fields are not assigned yet at this point
            Debug.Assert(chunkSize <= bufferMax);

            m_bufferMax = bufferMax;
            m_chunkSize = chunkSize;
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
        /// over a ready buffer. The chunk size stays zero, so <see cref="Feed()"/> never reads more data.
        /// </summary>
        /// <param name="buffer">The buffer with the data, can be null.</param>
        protected TextScanner(char[] buffer) {
            if (buffer != null) {
                m_buffer = buffer;
                m_bufferSize = buffer.Length;
            }
        }

        /// <summary>
        /// (hungry) Reads the next token.
        /// </summary>
        /// <returns><c>true</c>, if the token was read, <c>false</c> if there are no more tokens in the stream.</returns>
        /// <param name="dfa">The transition map for the automaton.</param>
        /// <param name="final">Final states of the automaton.</param>
        /// <param name="tags">Tags of the states.</param>
        /// <param name="state">The initial state for the automaton.</param>
        /// <param name="alphabet">The map from the character code to the input class.</param>
        /// <param name="tag">Receives the tags of the final state, or null when no token was read.</param>
        /// <exception cref="ParserException">The input doesn't form a valid token.</exception>
        internal bool ReadToken<TTag>(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
            Safe.ArgumentNotNull(dfa, "dfa");
            Safe.ArgumentNotNull(final, "final");
            Safe.ArgumentNotNull(tags, "tags");
            Safe.ArgumentNotNull(alphabet, "alphabet");

            tag = null;
            m_tokenLength = 0;

            var maxSymbol = alphabet.Length - 1;

            // the candidate state is kept apart from the current one, so the last
            // valid state is still available after an unreachable transition
            int next;

            do {
                // after the next chunk is read the offset in the buffer may change
                int pos = m_bufferOffset + m_tokenLength;
                next = state;

                while (pos < m_bufferSize) {
                    var ch = m_buffer[pos];

                    next = dfa[state, ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]];
                    if (next == DFAConst.UNREACHABLE_STATE)
                        break;

                    state = next;
                    pos++;
                }

                m_tokenLength = pos - m_bufferOffset;
            } while (next != DFAConst.UNREACHABLE_STATE && Feed());

            m_tokenOffset = m_bufferOffset;
            m_bufferOffset += m_tokenLength;

            if (final[state]) {
                tag = tags[state];
                return true;
            }

            if (m_bufferOffset == m_bufferSize) {
                if (m_tokenLength == 0) //EOF
                    return false;

                // the input has ended in the middle of a token
                throw new ParserException();
            }

            throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
        }

        /// <summary>
        /// Replaces the buffer with the specified data.
        /// </summary>
        /// <param name="buffer">The new buffer.</param>
        /// <param name="offset">The offset of the first character of the data.</param>
        /// <param name="length">The length of the data.</param>
        protected void Feed(char[] buffer, int offset, int length) {
            m_buffer = buffer;
            m_bufferOffset = offset;
            m_bufferSize = offset + length;
        }

        /// <summary>
        /// Reads the next chunk of data to the buffer, the unprocessed data is preserved.
        /// </summary>
        /// <returns><c>true</c> if some data was read, <c>false</c> if there is no more data.</returns>
        /// <exception cref="ParserException">The buffer would exceed the limit.</exception>
        protected bool Feed() {
            if (m_chunkSize <= 0)
                return false;

            if (m_buffer == null) {
                Debug.Assert(m_bufferOffset == 0);
                m_buffer = new char[m_chunkSize];
                m_bufferSize = Read(m_buffer, 0, m_chunkSize);
                return (m_bufferSize != 0);
            }

            var free = m_buffer.Length - m_bufferSize;
            int read;

            if (free < m_chunkSize) {
                // not enough room for a chunk: move the unprocessed data to a larger buffer
                free += m_chunkSize;
                var used = m_bufferSize - m_bufferOffset;
                var size = used + free;

                if (size > m_bufferMax)
                    throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax/1024));

                var temp = new char[size];

                read = Read(temp, used, m_chunkSize);
                if (read == 0)
                    return false;

                Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

                m_bufferOffset = 0;
                m_bufferSize = used + read;
                m_buffer = temp;
            } else {
                // the chunk fits into the tail of the current buffer
                read = Read(m_buffer, m_bufferSize, m_chunkSize);
                if (read == 0)
                    return false;

                m_bufferSize += read;
            }

            return true;
        }

        /// <summary>
        /// Reads the data from the underlying source.
        /// </summary>
        /// <returns>The number of characters read, zero when there is no more data.</returns>
        /// <param name="buffer">The buffer to write the data to.</param>
        /// <param name="offset">The offset in the buffer to start writing at.</param>
        /// <param name="size">The maximum number of characters to read.</param>
        protected abstract int Read(char[] buffer, int offset, int size);

        /// <summary>
        /// Gets the text of the last read token.
        /// </summary>
        public string GetTokenValue() {
            return new String(m_buffer, m_tokenOffset, m_tokenLength);
        }

        /// <summary>
        /// Copies the characters of the last read token to the specified buffer.
        /// </summary>
        /// <param name="buffer">The destination buffer.</param>
        /// <param name="offset">The offset in the destination buffer.</param>
        public void CopyTokenTo(char[] buffer, int offset) {
            // only the token is copied, not the whole internal buffer
            Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
        }

        /// <summary>
        /// Appends the characters of the last read token to the specified builder.
        /// </summary>
        public void CopyTokenTo(StringBuilder sb) {
            sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
        }

    }
}
@@ -151,11 +151,9
151 <Compile Include="Components\ExecutionState.cs" />
151 <Compile Include="Components\ExecutionState.cs" />
152 <Compile Include="Components\RunnableComponent.cs" />
152 <Compile Include="Components\RunnableComponent.cs" />
153 <Compile Include="Components\IFactory.cs" />
153 <Compile Include="Components\IFactory.cs" />
154 <Compile Include="Automaton\DFAStateDescriptor.cs" />
155 <Compile Include="Automaton\EnumAlphabet.cs" />
154 <Compile Include="Automaton\EnumAlphabet.cs" />
156 <Compile Include="Automaton\IAlphabet.cs" />
155 <Compile Include="Automaton\IAlphabet.cs" />
157 <Compile Include="Automaton\ParserException.cs" />
156 <Compile Include="Automaton\ParserException.cs" />
158 <Compile Include="Automaton\Scanner.cs" />
159 <Compile Include="Automaton\IndexedAlphabetBase.cs" />
157 <Compile Include="Automaton\IndexedAlphabetBase.cs" />
160 <Compile Include="Automaton\IAlphabetBuilder.cs" />
158 <Compile Include="Automaton\IAlphabetBuilder.cs" />
161 <Compile Include="Automaton\RegularExpressions\AltToken.cs" />
159 <Compile Include="Automaton\RegularExpressions\AltToken.cs" />
@@ -190,9 +188,10
190 <Compile Include="Automaton\RegularExpressions\RegularDFA.cs" />
188 <Compile Include="Automaton\RegularExpressions\RegularDFA.cs" />
191 <Compile Include="Automaton\RegularExpressions\RegularExpressionVisitor.cs" />
189 <Compile Include="Automaton\RegularExpressions\RegularExpressionVisitor.cs" />
192 <Compile Include="Automaton\RegularExpressions\ITaggedDFABuilder.cs" />
190 <Compile Include="Automaton\RegularExpressions\ITaggedDFABuilder.cs" />
193 <Compile Include="Automaton\RegularExpressions\DFAStateDescriptorT.cs" />
194 <Compile Include="Formats\BufferScanner.cs" />
195 <Compile Include="Formats\TextScanner.cs" />
191 <Compile Include="Formats\TextScanner.cs" />
192 <Compile Include="Formats\StringScanner.cs" />
193 <Compile Include="Formats\ReaderScanner.cs" />
194 <Compile Include="Formats\ScannerContext.cs" />
196 </ItemGroup>
195 </ItemGroup>
197 <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
196 <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
198 <ItemGroup />
197 <ItemGroup />
1 NO CONTENT: file was removed
NO CONTENT: file was removed
1 NO CONTENT: file was removed
NO CONTENT: file was removed
1 NO CONTENT: file was removed
NO CONTENT: file was removed
1 NO CONTENT: file was removed
NO CONTENT: file was removed
General Comments 0
You need to be logged in to leave comments. Login now