NouVeL/NVL/Parser.cpp

525 lines
14 KiB
C++
Raw Normal View History

2021-12-17 01:05:38 -05:00
#include "Parser.h"
2021-12-12 03:41:54 -05:00
#include <fstream>
#include <sstream>
#include <iostream>
2022-05-10 02:42:12 -04:00
#include <regex>
#include <utility>
2021-12-12 22:20:28 -05:00
#include <stdexcept>
2021-12-12 03:41:54 -05:00
2021-12-12 22:20:28 -05:00
#include "Environment.h"
#include <locale>
#include <codecvt>
2021-12-12 03:41:54 -05:00
namespace {
2022-08-18 12:17:43 -04:00
using namespace NVL;
2021-12-12 03:41:54 -05:00
struct ParseGroup {
String accept;
2021-12-12 22:20:28 -05:00
operator String() const {
2021-12-12 22:20:28 -05:00
return accept;
}
bool operator== (const String& other) const {
2021-12-12 22:20:28 -05:00
return accept == other;
}
2021-12-12 03:41:54 -05:00
};
struct Match {
String accept;
2021-12-17 01:05:38 -05:00
operator wchar_t() const {
2021-12-17 01:05:38 -05:00
if (accept.length() == 1)
return accept[0];
else {
std::wcerr << "Cannot demote Match " << accept << " to char" << std::endl;
2021-12-17 01:05:38 -05:00
return '\0';
}
2021-12-12 03:41:54 -05:00
}
bool operator== (const String& other) const {
2021-12-12 03:41:54 -05:00
return accept == other;
2021-05-16 12:59:18 -04:00
}
2021-12-12 03:41:54 -05:00
};
// --- Command-mode character classes and literal tokens ---

// Digits and the punctuation that may appear in a numeric literal.
const ParseGroup NUMERIC{ L"1234567890" };
const Match DECIMAL_DOT{ L"." };
const Match NEGATIVE{ L"-" };

// Latin letters, upper case then lower case.
const ParseGroup ALPHA{ L"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" };

// Array and subexpression delimiters.
const Match ARRAY_OPEN{ L"[" };
const Match ARRAY_CLOSE{ L"]" };
const Match ARRAY_DELIM{ L"," };
const Match GROUP_OPEN{ L"(" };
const Match GROUP_CLOSE{ L")" };

// String quote and line-comment introducer.
const Match QUOTE{ L"\"" };
const Match COMMENT_BEGIN{ L"//" };

// Dialogue block delimiters and scene keywords.
const Match DIALOGUE_OPEN{ L"<<-" };
const Match DIALOGUE_CLOSE{ L"->>" };
const Match BEGIN{ L"BEGIN" };
const Match END{ L"END" };

// Characters allowed inside a symbol name: letters, digits and underscore.
// Must stay below ALPHA and NUMERIC because it is built from them.
const ParseGroup SYMBOL{ ALPHA.accept + NUMERIC.accept + L"_" };
2021-12-17 01:05:38 -05:00
// Operator-like tokens that IsLegalSymbolName accepts as symbol names even
// though they are not made of SYMBOL characters.
const Match SPECIAL_SYMBOLS[] = {
    Match{ L"+" }, Match{ L"-" }, Match{ L"*" }, Match{ L"/" },
    Match{ L"=?" }, Match{ L">?" }, Match{ L"<?" },
    Match{ L"<=?" }, Match{ L">=?" }
};
// Whitespace characters.
const ParseGroup WS{ L" \t\v\f\r\n" };

// Characters that terminate a token: all whitespace, the array and group
// delimiters, and the first character of the comment introducer.
const ParseGroup SEPARATOR{
    WS.accept
    + static_cast<wchar_t>(ARRAY_OPEN)
    + static_cast<wchar_t>(ARRAY_CLOSE)
    + static_cast<wchar_t>(GROUP_OPEN)
    + static_cast<wchar_t>(GROUP_CLOSE)
    + static_cast<wchar_t>(ARRAY_DELIM)
    + COMMENT_BEGIN.accept[0]
};

const Match NEWLINE{ L"\n" };

// Backslash and double quote, the character group ParseString consults
// when it meets an escape; ESCAPE is the backslash that introduces one.
const ParseGroup ESCAPED{ L"\\\"" };
const Match ESCAPE{ L"\\" };
2021-12-13 14:04:12 -05:00
// Dialogue mode matches
const Match MARKUP_OPEN = { L"[" };
const Match MARKUP_CLOSE = { L"]" };
const Match SPEAKER_OPEN = { L"[" };
const Match SPEAKER_CLOSE = { L"]" };
const Match MARKUP_TEXT_OPEN = { L"{" };
const Match MARKUP_TEXT_CLOSE = { L"}" };
const Match TEMPLATE_IND = { L"$" };
const Match TEMPLATE_OPEN = { L"{" };
const Match TEMPLATE_CLOSE = { L"}" };
2021-12-13 14:04:12 -05:00
const Match COMMAND_ESCAPE = { L"*!" };
2021-12-13 14:04:12 -05:00
const ParseGroup DIALOGUE_ESCAPED_SINGLE = {
ESCAPE.accept +
wchar_t(MARKUP_OPEN) +
wchar_t(MARKUP_CLOSE) +
wchar_t(MARKUP_TEXT_OPEN) +
wchar_t(MARKUP_TEXT_CLOSE) +
2021-12-13 14:04:12 -05:00
// char(SPEAKER_OPEN) +
// char(SPEAKER_CLOSE) +
wchar_t(TEMPLATE_IND)
2021-12-13 14:04:12 -05:00
// char(TEMPLATE_OPEN) +
// char(TEMPLATE_CLOSE)
};
// Reads the whole file at `path`, skips a leading UTF-8 byte-order mark if
// one is present, and converts the remaining UTF-8 bytes to a wide string.
//
// Throws std::runtime_error when the file cannot be opened. The conversion
// itself (std::wstring_convert::from_bytes) throws std::range_error on
// malformed input.
std::wstring read_file_to_string(const std::string& path) {
    std::ifstream f(path);
    if (!f.is_open())
        throw std::runtime_error("Could not open file: " + path);

    { // Some apps on Windows add this signature in front of UTF-8 files when saving
        char signature[3] = {};
        f.read(signature, 3);
        const bool has_bom =
            f.gcount() == 3 &&
            signature[0] == static_cast<char>(0xEF) &&
            signature[1] == static_cast<char>(0xBB) &&
            signature[2] == static_cast<char>(0xBF);
        if (has_bom) {
            std::cerr << "Warning: Windows UTF-8 BOM skipped" << std::endl;
        }
        else {
            // A file shorter than three bytes leaves eofbit/failbit set, and
            // seekg() does nothing on a failed stream: clear the state first
            // so that short files are not silently read as empty.
            f.clear();
            f.seekg(0);
        }
    }

    std::stringstream buffer;
    buffer << f.rdbuf();
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
    return converter.from_bytes(buffer.str());
}
std::vector<String> split_string_by_lines(const String& str) {
std::vector<String> lines;
2022-08-18 12:17:43 -04:00
size_t pos = 0;
size_t prev = 0;
while ((pos = str.find(NEWLINE, prev)) != String::npos) {
2021-12-12 03:41:54 -05:00
lines.push_back(str.substr(prev, pos - prev));
prev = pos + 1;
2021-05-16 12:59:18 -04:00
}
2021-12-12 03:41:54 -05:00
lines.push_back(str.substr(prev));
return lines;
}
inline bool IsNumeric(const String& str) {
2021-12-17 01:05:38 -05:00
bool negative = str[0] == NEGATIVE;
2021-12-12 03:41:54 -05:00
bool had_dot = false;
2021-12-17 01:05:38 -05:00
for (auto& c : negative ? str.substr(1) : str) {
if (NUMERIC.accept.find(c) == String::npos) {
2021-12-12 03:41:54 -05:00
if (c == DECIMAL_DOT.accept[0]) {
if (had_dot)
return false;
else
had_dot = true;
}
else
return false;
}
2021-05-19 17:02:57 -04:00
}
2021-12-17 01:05:38 -05:00
if (had_dot + negative == str.length())
return false;
2021-12-12 03:41:54 -05:00
return true;
}
inline bool ContainsOnlyWS(const String& s) {
2021-12-17 01:05:38 -05:00
for (auto& c : s) {
if (WS.accept.find(c) == String::npos)
2021-12-17 01:05:38 -05:00
return false;
2021-12-12 03:41:54 -05:00
}
2021-12-17 01:05:38 -05:00
return true;
}
2021-12-13 14:04:12 -05:00
// Advances `pos` past any run of WS characters starting at `pos`.
// Expects pos <= f.size(): the terminating null read at f[f.size()] is not
// whitespace, which is what stops the scan at the end of the buffer.
void SkipWS(const String& f, size_t& pos) {
    for (; WS.accept.find(f[pos]) != String::npos; ++pos) {
    }
}
void SkipToNextLine(const String& f, size_t& pos) {
2021-12-12 03:41:54 -05:00
while (f[pos] != NEWLINE.accept[0])
pos++;
pos++;
}
2021-05-15 17:48:37 -04:00
// Skips whitespace and any number of consecutive COMMENT_BEGIN line
// comments, leaving `pos` on the first character that is neither.
void SkipComments(const String& f, size_t& pos) {
    for (SkipWS(f, pos); f.substr(pos, 2) == COMMENT_BEGIN.accept; SkipWS(f, pos))
        SkipToNextLine(f, pos);
}
void SkipOverFirstChar(const String& f, size_t& pos) {
2021-12-12 22:20:28 -05:00
SkipWS(f, pos);
pos++;
}
// Reads the next token: skips leading whitespace, then consumes at least one
// character and continues until the next SEPARATOR character or the end of
// the buffer. Because the first character is consumed unconditionally, a
// separator such as '[' or '(' comes back as the first character of the token.
//
// On return `pos` is the index just past the token. When only whitespace
// remains, an empty string is returned and `pos` is left at the end of the
// buffer rather than being advanced out of bounds.
String GetToken(const String& f, size_t& pos) {
    SkipWS(f, pos);
    const size_t start = pos;
    if (start >= f.size())
        return String{};
    do {
        ++pos;
    } while (pos < f.size() && SEPARATOR.accept.find(f[pos]) == String::npos);
    return f.substr(start, pos - start);
}

// Same as GetToken, but `pos` is taken by value so the caller's position is
// not advanced.
String PeekToken(const String& f, size_t pos) {
    return GetToken(f, pos);
}
bool IsLegalSymbolName(const String& token) {
2021-12-17 01:05:38 -05:00
for (const auto& x: SPECIAL_SYMBOLS) {
if (token == x.accept)
return true;
}
if (ALPHA.accept.find(token[0]) == String::npos)
2021-12-12 03:41:54 -05:00
return false;
for (auto& i : token)
if (SYMBOL.accept.find(i) == String::npos)
2021-12-12 03:41:54 -05:00
return false;
return true;
}
Parse::Object ParseExpression(const String& f, size_t& pos);

// Parses the members of an array; the caller has already consumed the opening
// ARRAY_OPEN. Members are expressions separated by ARRAY_DELIM. Parsing stops
// in front of the closing ARRAY_CLOSE, which is left for the caller to consume.
// At least one member is always parsed. `layer` is not used by this function.
// Throws std::runtime_error("Invalid array member") when a member is followed
// by anything other than a delimiter or the closing bracket.
Parse::Object ParseArray(const String& f, size_t& pos, u32 layer) {
    SkipComments(f, pos);

    std::vector<Parse::Object> members{};
    members.push_back(ParseExpression(f, pos));
    for (;;) {
        const wchar_t next = PeekToken(f, pos)[0];
        if (next == ARRAY_CLOSE)
            break;
        if (next != ARRAY_DELIM)
            throw std::runtime_error("Invalid array member");
        SkipOverFirstChar(f, pos);
        members.push_back(ParseExpression(f, pos));
    }
    return { Parse::Type::Array, members };
}
// Parses a double-quoted string literal. After leading whitespace/comments
// are skipped, `pos` is expected to be on the opening QUOTE. On return `pos`
// is just past the closing QUOTE and the result is the text between the
// quotes with the ESCAPE character removed from each escape sequence.
//
// Throws std::runtime_error:
//  - "Unclosed String" when a newline or the end of the buffer is reached
//    before the closing quote;
//  - "Unrecognized escape sequence" when ESCAPE is followed by a character
//    that is not listed in ESCAPED.
String ParseString(const String& f, size_t& pos) {
    SkipComments(f, pos);
    std::vector<size_t> discards{}; // positions of the backslashes to drop
    const auto start = ++pos; // skip opening quote
    for (;; ++pos) {
        if (pos >= f.size())
            throw std::runtime_error("Unclosed String");
        const wchar_t c = f[pos];
        if (c == QUOTE)
            break;
        if (c == ESCAPE) {
            // Validate the character AFTER the backslash; the backslash
            // itself is always a member of ESCAPED, so testing f[pos] would
            // accept every escape. pos + 1 <= f.size() here, so the read is
            // in range (the terminating null is simply not in ESCAPED).
            if (ESCAPED.accept.find(f[pos + 1]) == String::npos)
                throw std::runtime_error("Unrecognized escape sequence");
            discards.push_back(pos++); // land on the escaped char; the loop step moves past it
        }
        else if (c == NEWLINE) {
            throw std::runtime_error("Unclosed String");
        }
    }
    auto str = f.substr(start, pos++ - start); // post-increment steps over the closing quote
    for (size_t i = 0; i < discards.size(); i++) {
        // Each earlier erase shifted the remaining text left by one.
        str.erase(discards[i] - start - i, 1);
    }
    return str;
}
// Returns the `length` field of the environment entry stored under `key`;
// ParseCommand uses it as the number of arguments to parse for a procedure.
u32 GetProcedureArity(const String& key) {
    const auto& entry = Environment::ENVIRONMENT.get(key);
    return entry.length;
}
2021-05-14 23:17:55 -04:00
// Parses one command: a procedure name followed by as many argument
// expressions as GetProcedureArity reports for that name. The result holds
// the procedure Symbol first and the arguments after it. Trailing
// whitespace/comments are consumed before returning.
// Throws std::runtime_error("Illegal Procedure name") when the first token
// is not a legal symbol name.
Parse::Command ParseCommand(const String& f, size_t& pos) {
    SkipComments(f, pos);
    auto proc = GetToken(f, pos);
    if (!IsLegalSymbolName(proc)) throw std::runtime_error("Illegal Procedure name");

    Parse::Command c{ Parse::Object{ Parse::Type::Symbol, proc } };
    // The arity does not change while the arguments are parsed, so look it
    // up once instead of on every loop iteration.
    const u32 arity = GetProcedureArity(proc);
    for (u32 i = 0; i < arity; i++) {
        c.push_back(ParseExpression(f, pos));
    }

    SkipComments(f, pos);
    return c;
}
// Parses a single expression, chosen by the first character of the next token:
//   ARRAY_OPEN  -> array, closing ARRAY_CLOSE is consumed here
//   GROUP_OPEN  -> subexpression (a nested command), closing GROUP_CLOSE is consumed here
//   QUOTE       -> string literal
//   otherwise   -> a number if the token is numeric, else a symbol
// Throws std::runtime_error for an unexpected closing delimiter, a missing
// closing delimiter, or a token that is neither numeric nor a legal symbol.
Parse::Object ParseExpression(const String& f, size_t& pos) {
    SkipComments(f, pos);
    const wchar_t head = PeekToken(f, pos)[0];

    if (head == ARRAY_OPEN) {
        SkipOverFirstChar(f, pos);
        auto contents = ParseArray(f, pos, 0);
        if (PeekToken(f, pos)[0] != ARRAY_CLOSE)
            throw std::runtime_error("Cannot match closing Array");
        SkipOverFirstChar(f, pos);
        return contents;
    }

    if (head == GROUP_OPEN) {
        SkipOverFirstChar(f, pos);
        auto command = ParseCommand(f, pos);
        if (PeekToken(f, pos)[0] != GROUP_CLOSE)
            throw std::runtime_error("Cannot match closing subexpression");
        SkipOverFirstChar(f, pos);
        return Parse::Object{ Parse::Type::Subexpression, command };
    }

    if (head == GROUP_CLOSE)
        throw std::runtime_error("Cannot match closing subexpression, likely too few arguments");

    if (head == QUOTE)
        return { Parse::Type::String, ParseString(f, pos) };

    if (head == ARRAY_CLOSE)
        throw std::runtime_error("Cannot match closing array");

    // Plain token: number first, then symbol.
    auto token = GetToken(f, pos);
    if (IsNumeric(token))
        return { std::stof(token) };
    if (IsLegalSymbolName(token))
        return { Parse::Type::Symbol, token };
    throw std::runtime_error("Illegal symbol");
}
2021-12-12 03:41:54 -05:00
2022-05-10 02:42:12 -04:00
/*
 * NVL Markup Parsetree
 *
 * MatchMarkup returns an Array with one entry per `[specifiers]{contents}` tag:
 *
 * tag      = Array[ range, effects ]
 * range    = Array[ Num: begin, Num: end ]   offsets of `contents` in the stripped text
 * effects  = Array of
 *              Str: attribute name
 *            OR
 *              Array[ Str: function name, Array: parsed arguments ]
 */
// Finds every `[specifiers]{contents}` tag in `s`, replaces `s` with the text
// that remains once the markup syntax is removed (only `contents` is kept),
// and returns the tags as described above. When `s` contains no tag it is
// left untouched and an empty Array is returned.
Parse::Object MatchMarkup(String& s) {
    // Wide regexes: the subject is a wide string, and matching it with a
    // narrow regex would compare truncated characters.
    static const std::wregex typer(LR"((?:^|[^\\])\[([^\]]+)\]\s*\{([^\}]+)\})"); // G1 -> Specifiers, G2 -> Contents
    static const std::wregex effect(LR"(\s*(?:([^,\(\)]+?)\s*\(\s*([^\(\)]+?)\s*\)|([^,\(\)]+?))\s*(?:,\s*|$))"); // G1 & G2 -> Func, G3 -> Attr
    static const std::wregex param(LR"(([^,]+?)\s*(?:,\s*|$))"); // Comma split of func args

    std::vector<Parse::Object> tags;
    std::wsmatch tags_match;
    std::wsmatch effects_match;
    std::wsmatch params_match;

    String reconstruction{};
    size_t running_offset = 0; // to account for characters removed
    String::const_iterator tags_start(s.cbegin());

    while (std::regex_search(tags_start, s.cend(), tags_match, typer)) {
        // The match starts either directly on '[' (when `^` matched) or one
        // character before it (when `[^\\]` matched). Group 1 always begins
        // right after '[', so locate the bracket from there instead of
        // assuming a one-character lead.
        const String::const_iterator bracket = tags_match[1].first - 1;

        reconstruction.append(tags_match.prefix().first, bracket);
        reconstruction.append(tags_match[2].first, tags_match[2].second);
        running_offset += tags_match[2].first - bracket; // "[specifiers]", optional spaces and '{'

        size_t begin = tags_match[2].first - s.cbegin() - running_offset;
        Parse::Object range{ Parse::Type::Array, { std::vector<Parse::Object>{
                { static_cast<Number>(begin) },
                { static_cast<Number>(begin + (tags_match[2].second - tags_match[2].first)) }
            }
        } };

        std::vector<Parse::Object> effects{};
        String::const_iterator effects_start(tags_match[1].first);
        while (std::regex_search(effects_start, tags_match[1].second, effects_match, effect)) {
            if (effects_match[3].matched) {
                effects.push_back({ Parse::Type::String, effects_match[3].str() });
            }
            else {
                std::vector<Parse::Object> args;
                String::const_iterator params_start(effects_match[2].first);
                while (std::regex_search(params_start, effects_match[2].second, params_match, param)) {
                    size_t temp = 0;
                    args.push_back(ParseExpression(params_match[1].str() + SEPARATOR.accept[0], temp)); // PeekToken will freak out if I don't do this
                    params_start = params_match.suffix().first;
                }
                effects.push_back({ Parse::Type::Array, std::vector<Parse::Object>{ { Parse::Type::String, effects_match[1].str() }, { Parse::Type::Array, args } } });
            }
            effects_start = effects_match.suffix().first;
        }

        running_offset += tags_match[0].second - tags_match[2].second; // closing '}'
        tags_start = tags_match.suffix().first;
        tags.push_back({ Parse::Type::Array, std::vector<Parse::Object>{ range, { Parse::Type::Array, effects } } });
    }

    if (tags.empty())
        return { Parse::Type::Array, std::vector<Parse::Object>{} };

    // Keep the text after the last tag. tags_start is used on purpose: the
    // contents of tags_match are unspecified after the failed search that
    // ended the loop.
    reconstruction.append(tags_start, s.cend());
    s = reconstruction;
    return { Parse::Type::Array, tags };
}
// Turns one non-empty dialogue line into a command:
//  - a line starting with COMMAND_ESCAPE is parsed as a regular command;
//  - a line ending with SPEAKER_CLOSE must start with SPEAKER_OPEN and becomes
//    `SwitchSpeaker <text between the brackets>`;
//  - anything else becomes `Say <text with markup stripped> <markup tags>`.
// Throws std::runtime_error("Malformed speaker command") when a line ends with
// SPEAKER_CLOSE but does not start with SPEAKER_OPEN.
Parse::Command ParseDialogue(const String& s) {
    const bool is_command = s.compare(0, 2, COMMAND_ESCAPE.accept) == 0;
    if (is_command) {
        size_t dummy = 0;
        // Pad a space towards the end, the helpers do not expect strings to immediately terminate
        return ParseCommand(s.substr(2) + L" ", dummy);
    }

    // assume arity for SwitchSpeaker and Say
    if (s.back() == SPEAKER_CLOSE) {
        if (s.front() != SPEAKER_OPEN)
            throw std::runtime_error("Malformed speaker command");
        auto name = s.substr(1, s.length() - 2);
        return { { Parse::Type::Symbol, L"SwitchSpeaker" }, { Parse::Type::String, name } };
    }

    String text{ s };
    Parse::Object tags = MatchMarkup(text); // MatchMarkup rewrites `text` in place
    return { { Parse::Type::Symbol, L"Say" }, { Parse::Type::String, text }, tags };
}
2021-05-15 17:48:37 -04:00
// Parses one scene: `BEGIN <name>`, then commands and dialogue blocks, then
// `END`. A dialogue block starts at a DIALOGUE_OPEN token and runs until the
// first DIALOGUE_CLOSE that directly follows a newline; every line in between
// that is not blank is handed to ParseDialogue. Trailing whitespace/comments
// after END are consumed.
// Throws std::runtime_error for a missing BEGIN, an illegal scene name, a
// dialogue block without a terminator, or END being met while a dialogue
// block is still open.
Parse::Scene ParseScene(const String& f, size_t& pos) {
    SkipComments(f, pos);
    if (!(GetToken(f, pos) == BEGIN.accept))
        throw std::runtime_error("Could not match accept at root");

    auto scene_name = GetToken(f, pos);
    if (!IsLegalSymbolName(scene_name)) throw std::runtime_error("Illegal Scene name");
    Parse::Scene s{ scene_name };

    bool in_dialogue = false;
    while (PeekToken(f, pos) != END.accept) {
        if (in_dialogue) {
            auto close = f.find(NEWLINE.accept + DIALOGUE_CLOSE.accept, pos);
            if (close == String::npos)
                throw std::runtime_error("Dialogue does not terminate");
            auto lines = split_string_by_lines(f.substr(pos, close - pos));
            for (auto& line : lines) {
                if (!line.empty() && !ContainsOnlyWS(line))
                    s.append(ParseDialogue(line));
            }
            in_dialogue = false;
            pos = close;
            GetToken(f, pos); // skip DIALOGUE_CLOSE
            SkipComments(f, pos);
            continue;
        }

        if (PeekToken(f, pos) == DIALOGUE_OPEN.accept) {
            in_dialogue = true;
            GetToken(f, pos); // skip DIALOGUE_OPEN
            SkipComments(f, pos);
        }
        else {
            s.append(ParseCommand(f, pos));
        }
    }

    if (in_dialogue)
        throw std::runtime_error("Illegal Scene end");
    GetToken(f, pos); // skip END

    SkipComments(f, pos);
    return s;
}
}
2021-05-15 17:48:37 -04:00
2021-12-17 01:05:38 -05:00
namespace NVL::Parse {
2022-08-18 12:17:43 -04:00
Object::Object(const Number& v) : type(Type::Number), value(v) { }
Object::Object(Type t, const String& v) : type(t), value(v) {
2022-08-18 12:17:43 -04:00
if (t != Type::String && t != Type::Symbol) throw std::runtime_error("Bad type when constructing object!");
}
Object::Object(Type t, const std::vector<Object>& v) : type(t), value(v) {
if (t != Type::Array && t != Type::Subexpression) throw std::runtime_error("Bad type when constructing object!");
}
Object::Object(Number&& v) : type(Type::Number), value(std::move(v)) {
}
Object::Object(Type t, String&& v) : type(t), value(std::move(v)) {
2022-08-18 12:17:43 -04:00
if (t != Type::String && t != Type::Symbol) throw std::runtime_error("Bad type when constructing object!");
}
Object::Object(Type t, std::vector<Object>&& v) : type(t), value(std::move(v)) {
if (t != Type::Array && t != Type::Subexpression) throw std::runtime_error("Bad type when constructing object!");
}
2021-12-17 01:05:38 -05:00
std::vector<Scene> ParseFile(const std::string& path) {
std::wstring f = read_file_to_string(path);
2021-05-14 11:47:54 -04:00
2021-12-12 03:41:54 -05:00
std::vector<Scene> list {}; // Vector of scenes which each contain a vector of Parses
for (size_t i = 0; i < f.length(); i++) {
2021-12-12 22:20:28 -05:00
list.push_back(ParseScene(f, i));
2021-05-14 16:51:04 -04:00
}
2021-07-16 20:27:25 -04:00
2021-12-17 01:05:38 -05:00
return list;
2021-07-16 20:27:25 -04:00
}
2021-05-14 17:21:10 -04:00
}
2021-12-12 03:41:54 -05:00