JsonGrammar.cs
148 lines
| 6.1 KiB
| text/x-csharp
|
CSharpLexer
cin
|
r230 | using System.Linq; | ||
using Implab.Automaton.RegularExpressions; | ||||
using System; | ||||
using Implab.Automaton; | ||||
using Implab.Components; | ||||
namespace Implab.Formats.Json {
    /// <summary>
    /// Character-level grammar for JSON tokens. The constructor assembles two token
    /// expressions and compiles each into an <see cref="InputScanner{TokenType}"/>:
    /// one for the JSON structure (numbers, literals, punctuation, whitespace, string
    /// bounds) and one for the contents of a string literal.
    /// </summary>
    public class JsonGrammar : Grammar<char> {
        /// <summary>
        /// Tags attached to the token expressions. <see cref="None"/> is the default
        /// value and is what states without any tag are mapped to.
        /// </summary>
        public enum TokenType {
            None,
            BeginObject,
            EndObject,
            BeginArray,
            EndArray,
            String,
            Number,
            Literal,
            NameSeparator,
            ValueSeparator,
            Whitespace,
            StringBound,
            EscapedChar,
            UnescapedChar,
            EscapedUnicode
        }

        // Holder of the shared grammar; the factory lambda creates the instance.
        // NOTE(review): LazyAndWeak presumably creates it on first access and keeps
        // only a weak reference - confirm against Implab.Components.
        static LazyAndWeak<JsonGrammar> _instance = new LazyAndWeak<JsonGrammar>(() => new JsonGrammar());

        /// <summary>Gets the shared grammar instance provided by the holder above.</summary>
        public static JsonGrammar Instance {
            get { return _instance.Value; }
        }

        // Scanner prototypes built once in the constructor; the static factory
        // methods hand out clones of them.
        readonly InputScanner<TokenType> m_jsonExpression;
        readonly InputScanner<TokenType> m_stringExpression;

        // Alphabet backing this grammar; also returned by AlphabetBuilder.
        readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();

        /// <summary>Gets the alphabet this grammar was built with.</summary>
        public CharAlphabet DefaultAlphabet { get { return m_defaultAlphabet; } }

        /// <summary>
        /// Builds the token expressions for JSON and compiles them into the two
        /// scanner prototypes.
        /// </summary>
        /// <exception cref="ApplicationException">
        /// Propagated from <see cref="BuildScanner"/> when an expression accepts the empty token.
        /// </exception>
        public JsonGrammar() {
            // Hand the control characters U+0000..U+001F to the alphabet first.
            // NOTE(review): presumably this gives them their own symbol class - confirm in Grammar<T>.
            DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));

            // --- character classes ---
            var hexDigit = SymbolRangeToken('a','f').Or(SymbolRangeToken('A','F')).Or(SymbolRangeToken('0','9'));
            var digit9 = SymbolRangeToken('1', '9');
            var zero = SymbolToken('0');
            var digit = zero.Or(digit9);
            var dot = SymbolToken('.');
            var minus = SymbolToken('-');
            var sign = SymbolSetToken('-', '+');
            var expSign = SymbolSetToken('e', 'E');
            var letters = SymbolRangeToken('a', 'z');

            // --- number parts ---
            // NOTE(review): EClosure() appears to be "zero or more" and Closure()
            // "one or more" (judging by how they are used here) - confirm in Token.
            var integer = zero.Or(digit9.Cat(digit.EClosure()));
            var frac = dot.Cat(digit.Closure());
            var exp = expSign.Cat(sign.Optional()).Cat(digit.Closure());

            // --- string parts ---
            var quote = SymbolToken('"');
            var backSlash = SymbolToken('\\');
            var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
            var unicodeEspace = SymbolToken('u').Cat(hexDigit.Repeat(4));

            // --- punctuation, each wrapped in the 'whitespace' expression on both sides ---
            var whitespace = SymbolSetToken('\n', '\r', '\t', ' ').EClosure();
            var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
            var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
            var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
            var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
            var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
            var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);

            var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());
            // A literal is any run of lowercase ASCII letters; this expression does
            // not restrict it to particular words.
            var literal = letters.Closure();
            // Built from the exclusion list: control characters U+0000..U+001F,
            // the backslash and the double quote.
            var unescaped = SymbolTokenExcept(Enumerable.Range(0, 0x20).Union(new int[] { '\\', '"' }).Select(x => (char)x));

            // Tokens recognised outside of string literals.
            var jsonExpression =
                number.Tag(TokenType.Number)
                .Or(literal.Tag(TokenType.Literal))
                .Or(quote.Tag(TokenType.StringBound))
                .Or(beginObject.Tag(TokenType.BeginObject))
                .Or(endObject.Tag(TokenType.EndObject))
                .Or(beginArray.Tag(TokenType.BeginArray))
                .Or(endArray.Tag(TokenType.EndArray))
                .Or(nameSep.Tag(TokenType.NameSeparator))
                .Or(valueSep.Tag(TokenType.ValueSeparator))
                // The standalone whitespace token uses Closure() rather than the
                // EClosure() 'whitespace' expression above.
                // NOTE(review): presumably so it cannot match the empty token,
                // which BuildScanner rejects - confirm.
                .Or(SymbolSetToken('\n', '\r', '\t', ' ').Closure().Tag(TokenType.Whitespace));

            // Tokens recognised inside a string literal.
            var jsonStringExpression =
                quote.Tag(TokenType.StringBound)
                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
                .Or(backSlash.Cat(unicodeEspace).Tag(TokenType.EscapedUnicode))
                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));

            m_jsonExpression = BuildScanner(jsonExpression);
            m_stringExpression = BuildScanner(jsonStringExpression);
        }

        /// <summary>
        /// Returns a clone of the shared scanner for the JSON structure tokens.
        /// </summary>
        public static InputScanner<TokenType> CreateJsonExpressionScanner() {
            return Instance.m_jsonExpression.Clone();
        }

        /// <summary>
        /// Returns a clone of the shared scanner for the contents of string literals.
        /// </summary>
        public static InputScanner<TokenType> CreateStringExpressionScanner() {
            return Instance.m_stringExpression.Clone();
        }

        /// <summary>Gets the alphabet builder of this grammar (the default alphabet).</summary>
        protected override IAlphabetBuilder<char> AlphabetBuilder {
            get {
                return m_defaultAlphabet;
            }
        }

        /// <summary>
        /// Creates a token for the characters from <paramref name="start"/> to
        /// <paramref name="stop"/>, both inclusive.
        /// </summary>
        /// <param name="start">First character of the range.</param>
        /// <param name="stop">Last character of the range.</param>
        Token SymbolRangeToken(char start, char stop) {
            return SymbolToken(Enumerable.Range(start, stop - start + 1).Select(x => (char)x));
        }

        /// <summary>
        /// Compiles a tagged token expression into an input scanner: builds a DFA over
        /// this grammar's alphabet, optimizes it into a fresh <see cref="CharAlphabet"/>
        /// and wraps the resulting tables.
        /// </summary>
        /// <param name="regexp">The token expression to compile.</param>
        /// <returns>A scanner built from the optimized automaton.</returns>
        /// <exception cref="ApplicationException">
        /// The initial state of the automaton is final, i.e. the expression accepts the empty token.
        /// </exception>
        public InputScanner<TokenType> BuildScanner(Token regexp) {
            var dfa = new RegularDFA<char, TokenType>(AlphabetBuilder);

            var visitor = new RegularExpressionVisitor<TokenType>(dfa);
            regexp.Accept(visitor);
            visitor.BuildDFA();

            // Reject the expression when the initial state is already final.
            if (dfa.IsFinalState(dfa.InitialState))
                throw new ApplicationException("The specified language contains empty token");

            // The optimized automaton gets its own alphabet; the character map
            // passed to the scanner comes from it, not from the grammar's alphabet.
            var ab = new CharAlphabet();
            var optimal = dfa.Optimize(ab);

            return new InputScanner<TokenType>(
                optimal.CreateTransitionTable(),
                optimal.CreateFinalStateTable(),
                NormalizeTags(optimal.CreateTagTable()),
                optimal.InitialState,
                ab.CreateCharMap()
            );
        }

        /// <summary>
        /// Flattens a table of tag arrays into one tag per entry. An entry with no
        /// tags (null or empty) becomes <c>default(TokenType)</c>, an entry with one
        /// tag becomes that tag.
        /// </summary>
        /// <param name="tags">Tag arrays indexed by entry; individual entries may be null.</param>
        /// <returns>An array of the same length with exactly one tag per entry.</returns>
        /// <exception cref="InvalidOperationException">
        /// An entry carries more than one tag, so its token type is ambiguous.
        /// </exception>
        static TokenType[] NormalizeTags(TokenType[][] tags) {
            var result = new TokenType[tags.Length];
            for (var i = 0; i < tags.Length; i++) {
                if (tags[i] == null || tags[i].Length == 0)
                    result[i] = default(TokenType);
                else if (tags[i].Length == 1)
                    result[i] = tags[i][0];
                else
                    // A specific exception type instead of the reserved System.Exception;
                    // still caught by any handler that catches Exception.
                    throw new InvalidOperationException($"Ambigous state tags {string.Join(", ", tags[i])}");
            }
            return result;
        }
    }
}