using System;
using System.Linq;
using Implab.Automaton;
using Implab.Automaton.RegularExpressions;
using Implab.Components;

namespace Implab.Formats.Json {
    /// <summary>
    /// Regular grammar for JSON text. The constructor builds two scanners:
    /// one for the JSON structure (numbers, literals, punctuation, whitespace,
    /// opening quote) and one for the contents of a string literal.
    /// </summary>
    public class JsonGrammar : Grammar {
        /// <summary>
        /// Tags attached to the token expressions of this grammar.
        /// <see cref="None"/> is the first member and therefore the value of
        /// <c>default(TokenType)</c>, which is what untagged scanner states receive.
        /// </summary>
        public enum TokenType {
            None,
            BeginObject,
            EndObject,
            BeginArray,
            EndArray,
            String,
            Number,
            Literal,
            NameSeparator,
            ValueSeparator,
            Whitespace,

            // The members below are produced by the string-content scanner
            // (StringBound is also produced by the structural scanner).
            StringBound,
            EscapedChar,
            UnescapedChar,
            EscapedUnicode
        }

        // Not marked readonly on purpose: LazyAndWeak is declared elsewhere and
        // its kind (class/struct) is not visible from this file.
        static LazyAndWeak _instance = new LazyAndWeak(() => new JsonGrammar());

        /// <summary>
        /// The shared grammar instance, obtained from the <c>LazyAndWeak</c> holder.
        /// </summary>
        public static JsonGrammar Instance {
            get { return _instance.Value; }
        }

        readonly FastInputScanner m_jsonExpression;
        readonly FastInputScanner m_stringExpression;
        readonly CharAlphabet m_defaultAlphabet = new CharAlphabet();

        /// <summary>
        /// The alphabet used by this grammar while building token expressions.
        /// </summary>
        public CharAlphabet DefaultAlphabet {
            get { return m_defaultAlphabet; }
        }

        /// <summary>
        /// Defines the JSON token expressions and builds both scanners.
        /// </summary>
        public JsonGrammar() {
            // Characters U+0000..U+001F are handed to the alphabet first; the same
            // range is excluded from unescaped string content further below.
            DefineAlphabet(Enumerable.Range(0, 0x20).Select(x => (char)x));

            var hexDigit = SymbolRangeToken('a', 'f')
                .Or(SymbolRangeToken('A', 'F'))
                .Or(SymbolRangeToken('0', '9'));
            var digit9 = SymbolRangeToken('1', '9');
            var zero = SymbolToken('0');
            var digit = zero.Or(digit9);
            var dot = SymbolToken('.');
            var minus = SymbolToken('-');
            var sign = SymbolSetToken('-', '+');
            var expSign = SymbolSetToken('e', 'E');
            var letters = SymbolRangeToken('a', 'z');

            // NOTE(review): presumably EClosure() matches zero or more repetitions and
            // Closure() one or more, which is what the JSON number grammar needs here
            // - confirm against the Token implementation.
            var integer = zero.Or(digit9.Cat(digit.EClosure()));
            var frac = dot.Cat(digit.Closure());
            var exp = expSign.Cat(sign.Optional()).Cat(digit.Closure());

            var quote = SymbolToken('"');
            var backSlash = SymbolToken('\\');
            var specialEscapeChars = SymbolSetToken('\\', '"', '/', 'b', 'f', 't', 'n', 'r');
            var unicodeEscape = SymbolToken('u').Cat(hexDigit.Repeat(4));

            // Structural punctuation absorbs the whitespace on both of its sides.
            var whitespace = SymbolSetToken('\n', '\r', '\t', ' ').EClosure();
            var beginObject = whitespace.Cat(SymbolToken('{')).Cat(whitespace);
            var endObject = whitespace.Cat(SymbolToken('}')).Cat(whitespace);
            var beginArray = whitespace.Cat(SymbolToken('[')).Cat(whitespace);
            var endArray = whitespace.Cat(SymbolToken(']')).Cat(whitespace);
            var nameSep = whitespace.Cat(SymbolToken(':')).Cat(whitespace);
            var valueSep = whitespace.Cat(SymbolToken(',')).Cat(whitespace);

            var number = minus.Optional().Cat(integer).Cat(frac.Optional()).Cat(exp.Optional());
            var literal = letters.Closure();

            // Everything except U+0000..U+001F, the backslash and the double quote.
            var unescaped = SymbolTokenExcept(
                Enumerable.Range(0, 0x20)
                    .Union(new int[] { '\\', '"' })
                    .Select(x => (char)x)
            );

            // Tokens recognised outside of string literals.
            var jsonExpression = number.Tag(TokenType.Number)
                .Or(literal.Tag(TokenType.Literal))
                .Or(quote.Tag(TokenType.StringBound))
                .Or(beginObject.Tag(TokenType.BeginObject))
                .Or(endObject.Tag(TokenType.EndObject))
                .Or(beginArray.Tag(TokenType.BeginArray))
                .Or(endArray.Tag(TokenType.EndArray))
                .Or(nameSep.Tag(TokenType.NameSeparator))
                .Or(valueSep.Tag(TokenType.ValueSeparator))
                .Or(SymbolSetToken('\n', '\r', '\t', ' ').Closure().Tag(TokenType.Whitespace));

            // Tokens recognised inside a string literal, including the closing quote.
            var jsonStringExpression = quote.Tag(TokenType.StringBound)
                .Or(backSlash.Cat(specialEscapeChars).Tag(TokenType.EscapedChar))
                .Or(backSlash.Cat(unicodeEscape).Tag(TokenType.EscapedUnicode))
                .Or(unescaped.Closure().Tag(TokenType.UnescapedChar));

            m_jsonExpression = BuildFastScanner(jsonExpression);
            m_stringExpression = BuildFastScanner(jsonStringExpression);
        }

        /// <summary>
        /// Returns a clone of the structural scanner held by <see cref="Instance"/>.
        /// </summary>
        public static FastInputScanner CreateJsonExpressionScanner() {
            return Instance.m_jsonExpression.Clone();
        }

        /// <summary>
        /// Returns a clone of the string-content scanner held by <see cref="Instance"/>.
        /// </summary>
        public static FastInputScanner CreateStringExpressionScanner() {
            return Instance.m_stringExpression.Clone();
        }

        protected override IAlphabetBuilder AlphabetBuilder {
            get { return m_defaultAlphabet; }
        }

        /// <summary>
        /// Creates a token for every character in the inclusive range
        /// <paramref name="start"/>..<paramref name="stop"/>.
        /// </summary>
        Token SymbolRangeToken(char start, char stop) {
            return SymbolToken(Enumerable.Range(start, stop - start + 1).Select(x => (char)x));
        }

        /// <summary>
        /// Builds a DFA from the specified expression, optimizes it and wraps the
        /// resulting tables into a scanner.
        /// </summary>
        /// <param name="regexp">The tagged token expression to compile.</param>
        /// <returns>A scanner built from the optimized automaton.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="regexp"/> is null.</exception>
        /// <exception cref="ApplicationException">
        /// The initial state of the automaton is final, i.e. the expression accepts
        /// the empty token.
        /// </exception>
        public FastInputScanner BuildFastScanner(Token regexp) {
            // Fail with a descriptive error instead of a NullReferenceException
            // raised from the middle of the build.
            if (regexp == null)
                throw new ArgumentNullException(nameof(regexp));

            var dfa = new RegularDFA(AlphabetBuilder);
            var visitor = new RegularExpressionVisitor(dfa);

            regexp.Accept(visitor);
            visitor.BuildDFA();

            // The exception type is kept as is: callers may already catch it.
            if (dfa.IsFinalState(dfa.InitialState))
                throw new ApplicationException("The specified language contains empty token");

            // The optimized automaton gets a fresh alphabet, whose translation map
            // is handed to the scanner.
            var ab = new CharAlphabet();
            var optimal = dfa.Optimize(ab);

            return new FastInputScanner(
                optimal.CreateTransitionTable(),
                optimal.CreateFinalStateTable(),
                NormalizeTags(optimal.CreateTagTable()),
                optimal.InitialState,
                ab.GetTranslationMap()
            );
        }

        /// <summary>
        /// Collapses the per-state tag lists into a single tag per state.
        /// </summary>
        /// <param name="tags">Tag lists indexed by state; an entry may be null or empty.</param>
        /// <returns>
        /// An array of the same length: <c>default(TokenType)</c> for states without
        /// tags, otherwise the only tag of the state.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// A state carries more than one tag, so the token type is ambiguous.
        /// </exception>
        static TokenType[] NormalizeTags(TokenType[][] tags) {
            var result = new TokenType[tags.Length];

            for (var i = 0; i < tags.Length; i++) {
                var stateTags = tags[i];

                if (stateTags == null || stateTags.Length == 0) {
                    result[i] = default(TokenType);
                } else if (stateTags.Length == 1) {
                    result[i] = stateTags[0];
                } else {
                    // A specific exception type instead of the bare System.Exception;
                    // the message text is unchanged.
                    throw new InvalidOperationException($"Ambigous state tags {string.Join(", ", stateTags)}");
                }
            }

            return result;
        }
    }
}