Add multicharacter tokens to simple lexer

This commit is contained in:
Jan 2021-11-03 22:02:04 +01:00
parent ab7b516918
commit c5d7d71a51
11 changed files with 181 additions and 11 deletions

View File

@ -29,7 +29,12 @@ void LocalizeFileReader::SetupStreamProxies()
std::vector<LocalizeFileEntry> LocalizeFileReader::ReadLocalizeFile()
{
const auto lexer = std::make_unique<SimpleLexer>(m_stream, SimpleLexer::Config{true, true, false});
SimpleLexer::Config lexerConfig;
lexerConfig.m_emit_new_line_tokens = true;
lexerConfig.m_read_strings = true;
lexerConfig.m_read_numbers = false;
const auto lexer = std::make_unique<SimpleLexer>(m_stream, std::move(lexerConfig));
const auto parser = std::make_unique<LocalizeFileParser>(lexer.get(), m_language);
if (parser->Parse())

View File

@ -111,7 +111,12 @@ std::unique_ptr<ParsingResult> MenuFileReader::CreateParsingResult(MenuFileParse
std::unique_ptr<ParsingResult> MenuFileReader::ReadMenuFile()
{
const auto lexer = std::make_unique<SimpleLexer>(m_stream, SimpleLexer::Config{false, true, true});
SimpleLexer::Config lexerConfig;
lexerConfig.m_emit_new_line_tokens = false;
lexerConfig.m_read_strings = true;
lexerConfig.m_read_numbers = true;
const auto lexer = std::make_unique<SimpleLexer>(m_stream, std::move(lexerConfig));
const auto parser = std::make_unique<MenuFileParser>(lexer.get(), m_feature_level);
if (!parser->Parse())

View File

@ -66,7 +66,12 @@ std::vector<std::string> AbstractMenuDumper::CreateScriptTokenList(const char* s
const std::string scriptString(script);
std::istringstream stringStream(scriptString);
ParserSingleInputStream inputStream(stringStream, "MenuScript");
SimpleLexer lexer(&inputStream, SimpleLexer::Config{false, true, false});
SimpleLexer::Config lexerConfig;
lexerConfig.m_emit_new_line_tokens = false;
lexerConfig.m_read_strings = true;
lexerConfig.m_read_numbers = false;
SimpleLexer lexer(&inputStream, std::move(lexerConfig));
std::vector<std::string> result;
auto hasLexerTokens = true;

View File

@ -5,6 +5,7 @@
#include "SimpleMatcherKeyword.h"
#include "SimpleMatcherKeywordIgnoreCase.h"
#include "SimpleMatcherKeywordPrefix.h"
#include "SimpleMatcherMultiCharacter.h"
#include "SimpleMatcherValueType.h"
SimpleMatcherFactory::SimpleMatcherFactory(const IMatcherForLabelSupplier<SimpleParserValue>* labelSupplier)
@ -57,6 +58,11 @@ MatcherFactoryWrapper<SimpleParserValue> SimpleMatcherFactory::Char(char c) cons
return MatcherFactoryWrapper<SimpleParserValue>(std::make_unique<SimpleMatcherCharacter>(c));
}
/**
 * Creates a matcher that accepts a multi character token with the given sequence id.
 *
 * @param multiCharacterSequenceId Id the matched token must carry.
 * @return The new matcher wrapped in a MatcherFactoryWrapper.
 */
MatcherFactoryWrapper<SimpleParserValue> SimpleMatcherFactory::MultiChar(int multiCharacterSequenceId) const
{
    auto multiCharacterMatcher = std::make_unique<SimpleMatcherMultiCharacter>(multiCharacterSequenceId);

    return MatcherFactoryWrapper<SimpleParserValue>(std::move(multiCharacterMatcher));
}
MatcherFactoryWrapper<SimpleParserValue> SimpleMatcherFactory::AnyCharBesides(std::vector<char> chars) const
{
return MatcherFactoryWrapper<SimpleParserValue>(std::make_unique<SimpleMatcherAnyCharacterBesides>(std::move(chars)));

View File

@ -19,5 +19,6 @@ public:
_NODISCARD MatcherFactoryWrapper<SimpleParserValue> Integer() const;
_NODISCARD MatcherFactoryWrapper<SimpleParserValue> FloatingPoint() const;
_NODISCARD MatcherFactoryWrapper<SimpleParserValue> Char(char c) const;
_NODISCARD MatcherFactoryWrapper<SimpleParserValue> MultiChar(int multiCharacterSequenceId) const;
_NODISCARD MatcherFactoryWrapper<SimpleParserValue> AnyCharBesides(std::vector<char> chars) const;
};

View File

@ -0,0 +1,14 @@
#include "SimpleMatcherMultiCharacter.h"
SimpleMatcherMultiCharacter::SimpleMatcherMultiCharacter(const int multiCharacterSequenceId)
: m_multi_character_sequence_id(multiCharacterSequenceId)
{
}
/**
 * Tests whether the token at the given offset is a multi character token with the expected sequence id.
 *
 * @param lexer Lexer the token is fetched from.
 * @param tokenOffset Offset of the token to inspect, passed on to ILexer::GetToken.
 * @return A match consuming exactly one token, or no match.
 */
MatcherResult<SimpleParserValue> SimpleMatcherMultiCharacter::CanMatch(ILexer<SimpleParserValue>* lexer, const unsigned tokenOffset)
{
    const auto& candidate = lexer->GetToken(tokenOffset);

    // The type is checked first because MultiCharacterValue asserts on any other token type
    if (candidate.m_type != SimpleParserValueType::MULTI_CHARACTER)
        return MatcherResult<SimpleParserValue>::NoMatch();

    if (candidate.MultiCharacterValue() != m_multi_character_sequence_id)
        return MatcherResult<SimpleParserValue>::NoMatch();

    return MatcherResult<SimpleParserValue>::Match(1);
}

View File

@ -0,0 +1,15 @@
#pragma once
#include "Parsing/Simple/SimpleParserValue.h"
#include "Parsing/Matcher/AbstractMatcher.h"
/**
 * Matcher that accepts exactly one token of type MULTI_CHARACTER whose
 * sequence id equals the id given at construction.
 */
class SimpleMatcherMultiCharacter final : public AbstractMatcher<SimpleParserValue>
{
// Sequence id a token must carry to be accepted.
int m_multi_character_sequence_id;
protected:
// Inspects the token at tokenOffset of the given lexer and reports a one-token match or no match.
MatcherResult<SimpleParserValue> CanMatch(ILexer<SimpleParserValue>* lexer, unsigned tokenOffset) override;
public:
// Creates a matcher for the given multi character sequence id.
explicit SimpleMatcherMultiCharacter(int multiCharacterSequenceId);
};

View File

@ -1,17 +1,77 @@
#include "SimpleLexer.h"
SimpleLexer::Config::MultiCharacterToken::MultiCharacterToken(const int id, std::string value)
: m_id(id),
m_value(std::move(value))
{
}
SimpleLexer::MultiCharacterTokenLookupEntry::MultiCharacterTokenLookupEntry(const int id, std::string value)
: m_id(id),
m_value(std::move(value))
{
}
/**
 * Creates a lexer with the default configuration: no new line tokens,
 * strings and numbers are read, and no multi character tokens are configured.
 *
 * @param stream Line stream handed to the AbstractLexer base.
 */
SimpleLexer::SimpleLexer(IParserLineStream* stream)
    : AbstractLexer(stream),
      // Exactly one initializer per member: the list covers all four Config members,
      // the last one being the (empty) list of multi character tokens.
      m_config{false, true, true, {}},
      m_check_for_multi_character_tokens(false),
      m_last_line(1)
{
}
/**
 * Creates a lexer with a caller supplied configuration.
 * All configured multi character tokens are transferred into the internal
 * lookup table; the list inside the stored configuration is emptied afterwards.
 *
 * @param stream Line stream handed to the AbstractLexer base.
 * @param config Lexer configuration; taken by value and moved into the lexer.
 */
SimpleLexer::SimpleLexer(IParserLineStream* stream, Config config)
    : AbstractLexer(stream),
      m_config(std::move(config)),
      m_check_for_multi_character_tokens(false),
      m_last_line(1)
{
    // Iterate by reference so every token config is moved out of the vector directly
    // instead of being copied first; the vector is cleared right after anyway.
    for (auto& tokenConfig : m_config.m_multi_character_tokens)
        AddMultiCharacterTokenConfigToLookup(std::move(tokenConfig));

    m_config.m_multi_character_tokens.clear();
}
/**
 * Registers a multi character token in the lookup table.
 *
 * The table is indexed by the first character of the token (as an unsigned byte);
 * tokens sharing a first character form a singly linked chain. Each chain is kept
 * ordered from the longest to the shortest sequence, so a lookup that takes the first
 * matching entry always prefers the longest token. Without that ordering a token
 * registered after a shorter token that is its prefix could never be produced.
 * Entries of equal length keep their registration order.
 *
 * Tokens with an empty value are ignored.
 *
 * @param tokenConfig Token to register; its value is moved into the new lookup entry.
 */
void SimpleLexer::AddMultiCharacterTokenConfigToLookup(Config::MultiCharacterToken tokenConfig)
{
    if (tokenConfig.m_value.empty())
        return;

    m_check_for_multi_character_tokens = true;

    // Read the first character before the value is moved into the entry
    const auto firstCharacterValue = static_cast<uint8_t>(tokenConfig.m_value[0]);
    auto newEntry = std::make_unique<MultiCharacterTokenLookupEntry>(tokenConfig.m_id, std::move(tokenConfig.m_value));

    // Walk the owning pointers of the chain until the first entry that is strictly shorter than the new one
    auto* insertionPoint = &m_multi_character_token_lookup[firstCharacterValue];
    while (*insertionPoint && (*insertionPoint)->m_value.size() >= newEntry->m_value.size())
        insertionPoint = &(*insertionPoint)->m_next;

    // Splice the new entry in front of the remaining (shorter) entries
    newEntry->m_next = std::move(*insertionPoint);
    *insertionPoint = std::move(newEntry);
}
/**
 * Tries to read the given multi character token from the current line.
 * The comparison starts one character before the current line offset; the asserts
 * require that this character equals the first character of the token.
 *
 * @param multiTokenLookup Candidate entry; must not be null and must hold a non-empty value.
 * @return true if the line contains the complete sequence, in which case the line offset
 *         is moved to just behind it; false otherwise, leaving the line offset unchanged.
 */
bool SimpleLexer::ReadMultiCharacterToken(const MultiCharacterTokenLookupEntry* multiTokenLookup)
{
    const auto& currentLine = CurrentLine();

    assert(m_current_line_offset >= 1);
    assert(multiTokenLookup);
    assert(!multiTokenLookup->m_value.empty());
    assert(currentLine.m_line[m_current_line_offset - 1] == multiTokenLookup->m_value[0]);

    const auto& expectedSequence = multiTokenLookup->m_value;
    const char* cursor = &currentLine.m_line[m_current_line_offset - 1];

    for (auto expected = expectedSequence.cbegin(); expected != expectedSequence.cend(); ++expected, ++cursor)
    {
        // A zero character in the line ends the comparison before the sequence is complete
        if (*cursor == '\0' || *cursor != *expected)
            return false;
    }

    m_current_line_offset = m_current_line_offset - 1 + expectedSequence.size();
    return true;
}
SimpleParserValue SimpleLexer::GetNextToken()
@ -45,6 +105,18 @@ SimpleParserValue SimpleLexer::GetNextToken()
if (c == EOF)
return SimpleParserValue::EndOfFile(TokenPos());
if(m_check_for_multi_character_tokens)
{
const auto* multiTokenLookup = m_multi_character_token_lookup[static_cast<uint8_t>(c)].get();
while(multiTokenLookup)
{
if(ReadMultiCharacterToken(multiTokenLookup))
return SimpleParserValue::MultiCharacter(pos, multiTokenLookup->m_id);
multiTokenLookup = multiTokenLookup->m_next.get();
}
}
if (m_config.m_read_strings && c == '\"')
return SimpleParserValue::String(GetPreviousCharacterPos(), new std::string(ReadString()));

View File

@ -1,5 +1,9 @@
#pragma once
#include <cstdint>
#include <limits>
#include <memory>
#include "SimpleParserValue.h"
#include "Parsing/Impl/AbstractLexer.h"
@ -9,15 +13,41 @@ public:
/**
 * Options that control how a SimpleLexer tokenizes its input.
 *
 * The flags carry default member initializers so that a default constructed Config
 * that is filled field by field never contains indeterminate values. The defaults are
 * the same values the single-argument SimpleLexer constructor uses. Initializing all
 * members from a braced list, e.g. Config{false, true, true, {}}, keeps working.
 */
class Config
{
public:
    /**
     * A fixed character sequence that is reported as one token, together with the id
     * that identifies it.
     */
    class MultiCharacterToken
    {
    public:
        int m_id;
        std::string m_value;

        MultiCharacterToken(int id, std::string value);
    };

    // NOTE(review): presumably makes the lexer emit NEW_LINE tokens - confirm in GetNextToken
    bool m_emit_new_line_tokens = false;
    // When set, a double quote character starts a string token
    bool m_read_strings = true;
    // NOTE(review): presumably enables integer and floating point tokens - confirm in GetNextToken
    bool m_read_numbers = true;
    // Multi character tokens to recognize; the lexer constructor moves these into its lookup table
    std::vector<MultiCharacterToken> m_multi_character_tokens;
};
protected:
/**
 * Node of a singly linked chain of multi character tokens.
 * The lexer keeps one chain per possible first character; each node owns its
 * successor through m_next.
 */
class MultiCharacterTokenLookupEntry
{
public:
// Id that is reported when this token is read.
int m_id;
// Character sequence of the token.
std::string m_value;
// Next entry of the chain, or empty when this is the last one.
std::unique_ptr<MultiCharacterTokenLookupEntry> m_next;
// Creates an entry without a successor.
MultiCharacterTokenLookupEntry(int id, std::string value);
};
Config m_config;
bool m_check_for_multi_character_tokens;
int m_last_line;
protected:
std::unique_ptr<MultiCharacterTokenLookupEntry> m_multi_character_token_lookup[std::numeric_limits<uint8_t>::max() + 1];
void AddMultiCharacterTokenConfigToLookup(Config::MultiCharacterToken tokenConfig);
bool ReadMultiCharacterToken(const MultiCharacterTokenLookupEntry* multiTokenLookup);
SimpleParserValue GetNextToken() override;
public:

View File

@ -27,6 +27,13 @@ SimpleParserValue SimpleParserValue::Character(const TokenPos pos, const char c)
return pv;
}
/**
 * Creates a token value of type MULTI_CHARACTER.
 *
 * @param pos Position of the token.
 * @param multiCharacterSequenceId Id of the character sequence the token stands for.
 * @return The newly created token value.
 */
SimpleParserValue SimpleParserValue::MultiCharacter(const TokenPos pos, const int multiCharacterSequenceId)
{
    SimpleParserValue multiCharacterToken(pos, SimpleParserValueType::MULTI_CHARACTER);

    multiCharacterToken.m_value.multi_character_sequence_id = multiCharacterSequenceId;

    return multiCharacterToken;
}
SimpleParserValue SimpleParserValue::Integer(const TokenPos pos, const int value)
{
SimpleParserValue pv(pos, SimpleParserValueType::INTEGER);
@ -116,6 +123,12 @@ char SimpleParserValue::CharacterValue() const
return m_value.char_value;
}
/**
 * Returns the sequence id stored in a MULTI_CHARACTER token.
 * Must only be called on tokens of that type, which is checked by an assert.
 *
 * @return Id of the multi character sequence.
 */
int SimpleParserValue::MultiCharacterValue() const
{
    assert(m_type == SimpleParserValueType::MULTI_CHARACTER);

    const auto sequenceId = m_value.multi_character_sequence_id;
    return sequenceId;
}
int SimpleParserValue::IntegerValue() const
{
assert(m_type == SimpleParserValueType::INTEGER);

View File

@ -13,8 +13,9 @@ enum class SimpleParserValueType
END_OF_FILE,
NEW_LINE,
// Single character
// Character sequences
CHARACTER,
MULTI_CHARACTER,
// Generic token types
INTEGER,
@ -36,6 +37,7 @@ public:
{
char char_value;
int int_value;
int multi_character_sequence_id;
double double_value;
std::string* string_value;
} m_value;
@ -44,6 +46,7 @@ public:
static SimpleParserValue EndOfFile(TokenPos pos);
static SimpleParserValue NewLine(TokenPos pos);
static SimpleParserValue Character(TokenPos pos, char c);
static SimpleParserValue MultiCharacter(TokenPos pos, int multiCharacterSequenceId);
static SimpleParserValue Integer(TokenPos pos, int value);
static SimpleParserValue FloatingPoint(TokenPos pos, double value);
static SimpleParserValue String(TokenPos pos, std::string* stringValue);
@ -63,6 +66,7 @@ public:
_NODISCARD const TokenPos& GetPos() const override;
_NODISCARD char CharacterValue() const;
_NODISCARD int MultiCharacterValue() const;
_NODISCARD int IntegerValue() const;
_NODISCARD double FloatingPointValue() const;
_NODISCARD std::string& StringValue() const;