|
|
using System;
|
|
|
using Implab.Components;
|
|
|
using System.Diagnostics;
|
|
|
using Implab.Automaton;
|
|
|
using System.Text;
|
|
|
|
|
|
namespace Implab.Formats {
|
|
|
/// <summary>
/// Base class for buffered text scanners. Characters are kept in an internal buffer and
/// split into tokens by running a table-driven automaton over it (see <c>ReadToken</c>).
/// Input is obtained either chunk by chunk through <see cref="Read"/> or from a buffer
/// supplied by the derived class.
/// </summary>
public abstract class TextScanner : Disposable {
    // Upper bound for the size of a reallocated buffer, in characters.
    readonly int m_bufferMax;
    // Number of characters requested from Read() per refill; values <= 0 disable refilling.
    readonly int m_chunkSize;

    char[] m_buffer;
    // Index of the first character that has not been consumed yet.
    int m_bufferOffset;
    // Index one past the last valid character in m_buffer.
    int m_bufferSize;
    // Start and length of the most recently scanned token inside m_buffer.
    int m_tokenOffset;
    int m_tokenLength;

    /// <summary>
    /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
    /// which fills its buffer on demand through <see cref="Read"/>.
    /// </summary>
    /// <param name="bufferMax">Maximum size the internal buffer may grow to, in characters.</param>
    /// <param name="chunkSize">Number of characters requested per refill; expected not to exceed <paramref name="bufferMax"/>.</param>
    protected TextScanner(int bufferMax, int chunkSize) {
        // Check the arguments: the fields are still zero at this point, so asserting
        // on them would never fail.
        Debug.Assert(chunkSize <= bufferMax);

        m_bufferMax = bufferMax;
        m_chunkSize = chunkSize;
    }

    /// <summary>
    /// Initializes a new instance of the <see cref="Implab.Formats.TextScanner"/> class
    /// over a fixed buffer. The chunk size stays zero, so <see cref="Feed()"/> never
    /// reads additional input.
    /// </summary>
    /// <param name="buffer">Characters to scan; <c>null</c> leaves the scanner empty.</param>
    protected TextScanner(char[] buffer) {
        if (buffer != null) {
            m_buffer = buffer;
            m_bufferSize = buffer.Length;
        }
    }

    /// <summary>
    /// (hungry) Reads the next token: consumes characters for as long as the automaton
    /// has a transition for them, refilling the buffer when it runs out.
    /// </summary>
    /// <returns><c>true</c> if a token was read, <c>false</c> if there are no more tokens in the stream.</returns>
    /// <param name="dfa">The transition map of the automaton, indexed by [state, symbol class].</param>
    /// <param name="final">Final states of the automaton, indexed by state.</param>
    /// <param name="tags">Tags, indexed by state.</param>
    /// <param name="state">The initial state for the automaton.</param>
    /// <param name="alphabet">Maps a character code to its symbol class; characters beyond the end of the map are treated as <c>DFAConst.UNCLASSIFIED_INPUT</c>.</param>
    /// <param name="tag">Receives the tags of the final state when a token is read, otherwise <c>null</c>.</param>
    /// <exception cref="ArgumentNullException">One of the array arguments is <c>null</c>.</exception>
    /// <exception cref="ParserException">The input does not form a token.</exception>
    internal bool ReadToken<TTag>(int[,] dfa, bool[] final, TTag[][] tags, int state, int[] alphabet, out TTag[] tag) {
        // Assign the out parameter first so that every exit path leaves it defined.
        tag = null;

        if (dfa == null) {
            throw new ArgumentNullException("dfa");
        }
        if (final == null) {
            throw new ArgumentNullException("final");
        }
        if (tags == null) {
            throw new ArgumentNullException("tags");
        }
        if (alphabet == null) {
            throw new ArgumentNullException("alphabet");
        }

        m_tokenLength = 0;

        var maxSymbol = alphabet.Length - 1;

        // 'next' holds the result of the latest transition, while 'state' only ever
        // holds the last reachable state, so it stays usable as an index into
        // 'final' and 'tags' after the automaton hits a dead transition.
        int next;

        do {
            // after the next chunk is read the offset in the buffer may change
            int pos = m_bufferOffset + m_tokenLength;
            next = state;

            while (pos < m_bufferSize) {
                var ch = m_buffer[pos];

                next = dfa[state, ch > maxSymbol ? DFAConst.UNCLASSIFIED_INPUT : alphabet[ch]];
                if (next == DFAConst.UNREACHABLE_STATE) {
                    break;
                }

                state = next;
                pos++;
            }

            m_tokenLength = pos - m_bufferOffset;
        } while (next != DFAConst.UNREACHABLE_STATE && Feed());

        m_tokenOffset = m_bufferOffset;
        m_bufferOffset += m_tokenLength;

        if (final[state]) {
            tag = tags[state];
            return true;
        }

        if (m_bufferOffset == m_bufferSize) {
            if (m_tokenLength == 0) { //EOF
                return false;
            }

            // The input ended in the middle of a token.
            throw new ParserException();
        }

        throw new ParserException(String.Format("Unexpected symbol '{0}'", m_buffer[m_bufferOffset]));
    }

    /// <summary>
    /// Replaces the internal buffer with the specified range of an external buffer.
    /// </summary>
    /// <param name="buffer">The buffer to scan.</param>
    /// <param name="offset">Index of the first character to scan.</param>
    /// <param name="length">Number of characters available starting at <paramref name="offset"/>.</param>
    protected void Feed(char[] buffer, int offset, int length) {
        m_buffer = buffer;
        m_bufferOffset = offset;
        m_bufferSize = offset + length;
    }

    /// <summary>
    /// Reads the next chunk of input into the internal buffer, keeping the characters
    /// which are not consumed yet.
    /// </summary>
    /// <returns><c>true</c> if new characters were added to the buffer, <c>false</c> if no more input is available.</returns>
    /// <exception cref="ParserException">The buffer would have to grow beyond the configured limit.</exception>
    protected bool Feed() {
        if (m_chunkSize <= 0) {
            return false;
        }

        if (m_buffer == null) {
            // First read: allocate a buffer of exactly one chunk.
            Debug.Assert(m_bufferOffset == 0);
            m_buffer = new char[m_chunkSize];
            m_bufferSize = Read(m_buffer, 0, m_chunkSize);
            return (m_bufferSize != 0);
        }

        var free = m_buffer.Length - m_bufferSize;

        if (free >= m_chunkSize) {
            // There is enough room after the valid data, read the chunk in place.
            var appended = Read(m_buffer, m_bufferSize, m_chunkSize);
            if (appended == 0) {
                return false;
            }

            m_bufferSize += appended;
            return true;
        }

        // Not enough room: move the unconsumed characters to the start of a new
        // buffer which additionally has space for one more chunk.
        free += m_chunkSize;
        var used = m_bufferSize - m_bufferOffset;
        var size = used + free;

        if (size > m_bufferMax) {
            throw new ParserException(String.Format("The buffer limit ({0} Kb) is reached", m_bufferMax/1024));
        }

        var temp = new char[size];

        // Read before switching buffers so that the current buffer stays intact
        // when there is no more input.
        var read = Read(temp, used, m_chunkSize);
        if (read == 0) {
            return false;
        }

        Array.Copy(m_buffer, m_bufferOffset, temp, 0, used);

        m_bufferOffset = 0;
        m_bufferSize = used + read;
        m_buffer = temp;

        return true;
    }

    /// <summary>
    /// Reads up to <paramref name="size"/> characters from the underlying source into
    /// <paramref name="buffer"/> starting at <paramref name="offset"/>.
    /// </summary>
    /// <returns>The number of characters read; zero is treated as the end of the input.</returns>
    protected abstract int Read(char[] buffer, int offset, int size);

    /// <summary>
    /// Gets the text of the most recently scanned token.
    /// </summary>
    public string GetTokenValue() {
        return new String(m_buffer, m_tokenOffset, m_tokenLength);
    }

    /// <summary>
    /// Copies the characters of the most recently scanned token to the specified array.
    /// </summary>
    /// <param name="buffer">Destination array.</param>
    /// <param name="offset">Index in <paramref name="buffer"/> at which the token starts.</param>
    public void CopyTokenTo(char[] buffer, int offset) {
        // Copy only the token rather than the entire internal buffer.
        Array.Copy(m_buffer, m_tokenOffset, buffer, offset, m_tokenLength);
    }

    /// <summary>
    /// Appends the characters of the most recently scanned token to the specified builder.
    /// </summary>
    /// <param name="sb">The builder which receives the token.</param>
    public void CopyTokenTo(StringBuilder sb) {
        sb.Append(m_buffer, m_tokenOffset, m_tokenLength);
    }
}
|
|
|
}
|
|
|
|
|
|
|