From c5d7d71a51d6c64cbba1a4e4a6055a205fb330df Mon Sep 17 00:00:00 2001 From: Jan Date: Wed, 3 Nov 2021 22:02:04 +0100 Subject: [PATCH] Add multicharacter tokens to simple lexer --- .../LocalizeFile/LocalizeFileReader.cpp | 7 +- .../Parsing/Menu/MenuFileReader.cpp | 7 +- src/ObjWriting/Menu/AbstractMenuDumper.cpp | 7 +- .../Simple/Matcher/SimpleMatcherFactory.cpp | 6 ++ .../Simple/Matcher/SimpleMatcherFactory.h | 1 + .../Matcher/SimpleMatcherMultiCharacter.cpp | 14 ++++ .../Matcher/SimpleMatcherMultiCharacter.h | 15 ++++ src/Parser/Parsing/Simple/SimpleLexer.cpp | 84 +++++++++++++++++-- src/Parser/Parsing/Simple/SimpleLexer.h | 32 ++++++- .../Parsing/Simple/SimpleParserValue.cpp | 13 +++ src/Parser/Parsing/Simple/SimpleParserValue.h | 6 +- 11 files changed, 181 insertions(+), 11 deletions(-) create mode 100644 src/Parser/Parsing/Simple/Matcher/SimpleMatcherMultiCharacter.cpp create mode 100644 src/Parser/Parsing/Simple/Matcher/SimpleMatcherMultiCharacter.h diff --git a/src/ObjLoading/Parsing/LocalizeFile/LocalizeFileReader.cpp b/src/ObjLoading/Parsing/LocalizeFile/LocalizeFileReader.cpp index dde66961..1a4af3fb 100644 --- a/src/ObjLoading/Parsing/LocalizeFile/LocalizeFileReader.cpp +++ b/src/ObjLoading/Parsing/LocalizeFile/LocalizeFileReader.cpp @@ -29,7 +29,12 @@ void LocalizeFileReader::SetupStreamProxies() std::vector LocalizeFileReader::ReadLocalizeFile() { - const auto lexer = std::make_unique(m_stream, SimpleLexer::Config{true, true, false}); + SimpleLexer::Config lexerConfig; + lexerConfig.m_emit_new_line_tokens = true; + lexerConfig.m_read_strings = true; + lexerConfig.m_read_numbers = false; + const auto lexer = std::make_unique(m_stream, std::move(lexerConfig)); + const auto parser = std::make_unique(lexer.get(), m_language); if (parser->Parse()) diff --git a/src/ObjLoading/Parsing/Menu/MenuFileReader.cpp b/src/ObjLoading/Parsing/Menu/MenuFileReader.cpp index 9e7a51f0..dabf469c 100644 --- a/src/ObjLoading/Parsing/Menu/MenuFileReader.cpp +++ 
b/src/ObjLoading/Parsing/Menu/MenuFileReader.cpp @@ -111,7 +111,12 @@ std::unique_ptr MenuFileReader::CreateParsingResult(MenuFileParse std::unique_ptr MenuFileReader::ReadMenuFile() { - const auto lexer = std::make_unique(m_stream, SimpleLexer::Config{false, true, true}); + SimpleLexer::Config lexerConfig; + lexerConfig.m_emit_new_line_tokens = false; + lexerConfig.m_read_strings = true; + lexerConfig.m_read_numbers = true; + const auto lexer = std::make_unique(m_stream, std::move(lexerConfig)); + const auto parser = std::make_unique(lexer.get(), m_feature_level); if (!parser->Parse()) diff --git a/src/ObjWriting/Menu/AbstractMenuDumper.cpp b/src/ObjWriting/Menu/AbstractMenuDumper.cpp index 104ed5cb..673cb44a 100644 --- a/src/ObjWriting/Menu/AbstractMenuDumper.cpp +++ b/src/ObjWriting/Menu/AbstractMenuDumper.cpp @@ -66,7 +66,12 @@ std::vector AbstractMenuDumper::CreateScriptTokenList(const char* s const std::string scriptString(script); std::istringstream stringStream(scriptString); ParserSingleInputStream inputStream(stringStream, "MenuScript"); - SimpleLexer lexer(&inputStream, SimpleLexer::Config{false, true, false}); + + SimpleLexer::Config lexerConfig; + lexerConfig.m_emit_new_line_tokens = false; + lexerConfig.m_read_strings = true; + lexerConfig.m_read_numbers = false; + SimpleLexer lexer(&inputStream, std::move(lexerConfig)); std::vector result; auto hasLexerTokens = true; diff --git a/src/Parser/Parsing/Simple/Matcher/SimpleMatcherFactory.cpp b/src/Parser/Parsing/Simple/Matcher/SimpleMatcherFactory.cpp index 73a86094..5c5a881b 100644 --- a/src/Parser/Parsing/Simple/Matcher/SimpleMatcherFactory.cpp +++ b/src/Parser/Parsing/Simple/Matcher/SimpleMatcherFactory.cpp @@ -5,6 +5,7 @@ #include "SimpleMatcherKeyword.h" #include "SimpleMatcherKeywordIgnoreCase.h" #include "SimpleMatcherKeywordPrefix.h" +#include "SimpleMatcherMultiCharacter.h" #include "SimpleMatcherValueType.h" SimpleMatcherFactory::SimpleMatcherFactory(const IMatcherForLabelSupplier* 
labelSupplier) @@ -57,6 +58,11 @@ MatcherFactoryWrapper SimpleMatcherFactory::Char(char c) cons return MatcherFactoryWrapper(std::make_unique(c)); } +MatcherFactoryWrapper SimpleMatcherFactory::MultiChar(int multiCharacterSequenceId) const +{ + return MatcherFactoryWrapper(std::make_unique(multiCharacterSequenceId)); +} + MatcherFactoryWrapper SimpleMatcherFactory::AnyCharBesides(std::vector chars) const { return MatcherFactoryWrapper(std::make_unique(std::move(chars))); diff --git a/src/Parser/Parsing/Simple/Matcher/SimpleMatcherFactory.h b/src/Parser/Parsing/Simple/Matcher/SimpleMatcherFactory.h index 9841a93e..8abd31b4 100644 --- a/src/Parser/Parsing/Simple/Matcher/SimpleMatcherFactory.h +++ b/src/Parser/Parsing/Simple/Matcher/SimpleMatcherFactory.h @@ -19,5 +19,6 @@ public: _NODISCARD MatcherFactoryWrapper Integer() const; _NODISCARD MatcherFactoryWrapper FloatingPoint() const; _NODISCARD MatcherFactoryWrapper Char(char c) const; + _NODISCARD MatcherFactoryWrapper MultiChar(int multiCharacterSequenceId) const; _NODISCARD MatcherFactoryWrapper AnyCharBesides(std::vector chars) const; }; diff --git a/src/Parser/Parsing/Simple/Matcher/SimpleMatcherMultiCharacter.cpp b/src/Parser/Parsing/Simple/Matcher/SimpleMatcherMultiCharacter.cpp new file mode 100644 index 00000000..8cbcc966 --- /dev/null +++ b/src/Parser/Parsing/Simple/Matcher/SimpleMatcherMultiCharacter.cpp @@ -0,0 +1,14 @@ +#include "SimpleMatcherMultiCharacter.h" + +SimpleMatcherMultiCharacter::SimpleMatcherMultiCharacter(const int multiCharacterSequenceId) + : m_multi_character_sequence_id(multiCharacterSequenceId) +{ +} + +MatcherResult SimpleMatcherMultiCharacter::CanMatch(ILexer* lexer, const unsigned tokenOffset) +{ + const auto& token = lexer->GetToken(tokenOffset); + return token.m_type == SimpleParserValueType::MULTI_CHARACTER && token.MultiCharacterValue() == m_multi_character_sequence_id + ? 
MatcherResult::Match(1) + : MatcherResult::NoMatch(); +} diff --git a/src/Parser/Parsing/Simple/Matcher/SimpleMatcherMultiCharacter.h b/src/Parser/Parsing/Simple/Matcher/SimpleMatcherMultiCharacter.h new file mode 100644 index 00000000..cb68fe6e --- /dev/null +++ b/src/Parser/Parsing/Simple/Matcher/SimpleMatcherMultiCharacter.h @@ -0,0 +1,15 @@ +#pragma once + +#include "Parsing/Simple/SimpleParserValue.h" +#include "Parsing/Matcher/AbstractMatcher.h" + +class SimpleMatcherMultiCharacter final : public AbstractMatcher +{ + int m_multi_character_sequence_id; + +protected: + MatcherResult CanMatch(ILexer* lexer, unsigned tokenOffset) override; + +public: + explicit SimpleMatcherMultiCharacter(int multiCharacterSequenceId); +}; diff --git a/src/Parser/Parsing/Simple/SimpleLexer.cpp b/src/Parser/Parsing/Simple/SimpleLexer.cpp index a63dd201..f71608e5 100644 --- a/src/Parser/Parsing/Simple/SimpleLexer.cpp +++ b/src/Parser/Parsing/Simple/SimpleLexer.cpp @@ -1,17 +1,77 @@ #include "SimpleLexer.h" +SimpleLexer::Config::MultiCharacterToken::MultiCharacterToken(const int id, std::string value) + : m_id(id), + m_value(std::move(value)) +{ +} + +SimpleLexer::MultiCharacterTokenLookupEntry::MultiCharacterTokenLookupEntry(const int id, std::string value) + : m_id(id), + m_value(std::move(value)) +{ +} + SimpleLexer::SimpleLexer(IParserLineStream* stream) : AbstractLexer(stream), - m_config{false, true, true}, - m_last_line(1) + m_config{false, true, true, {}}, + m_check_for_multi_character_tokens(false), + m_last_line(1) { } SimpleLexer::SimpleLexer(IParserLineStream* stream, Config config) : AbstractLexer(stream), - m_config(config), - m_last_line(1) + m_config(std::move(config)), + m_check_for_multi_character_tokens(false), + m_last_line(1) { + for (auto tokenConfig : m_config.m_multi_character_tokens) + AddMultiCharacterTokenConfigToLookup(std::move(tokenConfig)); + m_config.m_multi_character_tokens.clear(); +} + +void 
SimpleLexer::AddMultiCharacterTokenConfigToLookup(Config::MultiCharacterToken tokenConfig) +{ + if (tokenConfig.m_value.empty()) + return; + + m_check_for_multi_character_tokens = true; + const auto firstCharacterValue = static_cast(tokenConfig.m_value[0]); + + if (m_multi_character_token_lookup[firstCharacterValue]) + { + auto* currentEntry = m_multi_character_token_lookup[firstCharacterValue].get(); + while (currentEntry->m_next) + currentEntry = currentEntry->m_next.get(); + + currentEntry->m_next = std::make_unique(tokenConfig.m_id, std::move(tokenConfig.m_value)); + } + else + { + m_multi_character_token_lookup[firstCharacterValue] = std::make_unique(tokenConfig.m_id, std::move(tokenConfig.m_value)); + } +} + +bool SimpleLexer::ReadMultiCharacterToken(const MultiCharacterTokenLookupEntry* multiTokenLookup) +{ + const auto& currentLine = CurrentLine(); + assert(m_current_line_offset >= 1); + assert(multiTokenLookup); + assert(!multiTokenLookup->m_value.empty()); + assert(currentLine.m_line[m_current_line_offset - 1] == multiTokenLookup->m_value[0]); + + const char* linePos = &currentLine.m_line[m_current_line_offset - 1]; + + for (const auto c : multiTokenLookup->m_value) + { + if (!*linePos || *linePos != c) + return false; + linePos++; + } + + m_current_line_offset = m_current_line_offset - 1 + multiTokenLookup->m_value.size(); + return true; } SimpleParserValue SimpleLexer::GetNextToken() @@ -32,7 +92,7 @@ SimpleParserValue SimpleLexer::GetNextToken() NextChar(); c = PeekChar(); } - + const auto pos = GetNextCharacterPos(); if (m_config.m_emit_new_line_tokens && pos.m_line > m_last_line) { @@ -45,9 +105,21 @@ SimpleParserValue SimpleLexer::GetNextToken() if (c == EOF) return SimpleParserValue::EndOfFile(TokenPos()); + if(m_check_for_multi_character_tokens) + { + const auto* multiTokenLookup = m_multi_character_token_lookup[static_cast(c)].get(); + while(multiTokenLookup) + { + if(ReadMultiCharacterToken(multiTokenLookup)) + return 
SimpleParserValue::MultiCharacter(pos, multiTokenLookup->m_id); + + multiTokenLookup = multiTokenLookup->m_next.get(); + } + } + if (m_config.m_read_strings && c == '\"') return SimpleParserValue::String(GetPreviousCharacterPos(), new std::string(ReadString())); - + if (m_config.m_read_numbers && (isdigit(c) || c == '.' && isdigit(PeekChar()))) { bool isFloatingPointValue; diff --git a/src/Parser/Parsing/Simple/SimpleLexer.h b/src/Parser/Parsing/Simple/SimpleLexer.h index 2333f4bd..dceae153 100644 --- a/src/Parser/Parsing/Simple/SimpleLexer.h +++ b/src/Parser/Parsing/Simple/SimpleLexer.h @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include + #include "SimpleParserValue.h" #include "Parsing/Impl/AbstractLexer.h" @@ -9,15 +13,41 @@ public: class Config { public: + class MultiCharacterToken + { + public: + int m_id; + std::string m_value; + + MultiCharacterToken(int id, std::string value); + }; + bool m_emit_new_line_tokens; bool m_read_strings; bool m_read_numbers; + std::vector m_multi_character_tokens; + }; + +protected: + class MultiCharacterTokenLookupEntry + { + public: + int m_id; + std::string m_value; + std::unique_ptr m_next; + + MultiCharacterTokenLookupEntry(int id, std::string value); }; Config m_config; + bool m_check_for_multi_character_tokens; int m_last_line; -protected: + std::unique_ptr m_multi_character_token_lookup[std::numeric_limits::max() + 1]; + + void AddMultiCharacterTokenConfigToLookup(Config::MultiCharacterToken tokenConfig); + bool ReadMultiCharacterToken(const MultiCharacterTokenLookupEntry* multiTokenLookup); + SimpleParserValue GetNextToken() override; public: diff --git a/src/Parser/Parsing/Simple/SimpleParserValue.cpp b/src/Parser/Parsing/Simple/SimpleParserValue.cpp index d79149e9..51be28c9 100644 --- a/src/Parser/Parsing/Simple/SimpleParserValue.cpp +++ b/src/Parser/Parsing/Simple/SimpleParserValue.cpp @@ -27,6 +27,13 @@ SimpleParserValue SimpleParserValue::Character(const TokenPos pos, const char c) return pv; } 
+SimpleParserValue SimpleParserValue::MultiCharacter(const TokenPos pos, const int multiCharacterSequenceId) +{ + SimpleParserValue pv(pos, SimpleParserValueType::MULTI_CHARACTER); + pv.m_value.multi_character_sequence_id = multiCharacterSequenceId; + return pv; +} + SimpleParserValue SimpleParserValue::Integer(const TokenPos pos, const int value) { SimpleParserValue pv(pos, SimpleParserValueType::INTEGER); @@ -116,6 +123,12 @@ char SimpleParserValue::CharacterValue() const return m_value.char_value; } +int SimpleParserValue::MultiCharacterValue() const +{ + assert(m_type == SimpleParserValueType::MULTI_CHARACTER); + return m_value.multi_character_sequence_id; +} + int SimpleParserValue::IntegerValue() const { assert(m_type == SimpleParserValueType::INTEGER); diff --git a/src/Parser/Parsing/Simple/SimpleParserValue.h b/src/Parser/Parsing/Simple/SimpleParserValue.h index eca0ae4e..520294f0 100644 --- a/src/Parser/Parsing/Simple/SimpleParserValue.h +++ b/src/Parser/Parsing/Simple/SimpleParserValue.h @@ -13,8 +13,9 @@ enum class SimpleParserValueType END_OF_FILE, NEW_LINE, - // Single character + // Character sequences CHARACTER, + MULTI_CHARACTER, // Generic token types INTEGER, @@ -36,6 +37,7 @@ public: { char char_value; int int_value; + int multi_character_sequence_id; double double_value; std::string* string_value; } m_value; @@ -44,6 +46,7 @@ public: static SimpleParserValue EndOfFile(TokenPos pos); static SimpleParserValue NewLine(TokenPos pos); static SimpleParserValue Character(TokenPos pos, char c); + static SimpleParserValue MultiCharacter(TokenPos pos, int multiCharacterSequenceId); static SimpleParserValue Integer(TokenPos pos, int value); static SimpleParserValue FloatingPoint(TokenPos pos, double value); static SimpleParserValue String(TokenPos pos, std::string* stringValue); @@ -63,6 +66,7 @@ public: _NODISCARD const TokenPos& GetPos() const override; _NODISCARD char CharacterValue() const; + _NODISCARD int MultiCharacterValue() const; _NODISCARD int 
IntegerValue() const; _NODISCARD double FloatingPointValue() const; _NODISCARD std::string& StringValue() const;