You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

381 lines
13 KiB

#include <algorithm>
#include <array>
#include <bit>
#include <cctype>
#include <charconv>
#include <climits>
#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <optional>
#include <sstream>
#include <string>
#include <string_view>
#include <system_error>
#include <utility>
#include <vector>
#include "UserScript/parser.h"
#include "UserScript.h"
  12. /////////////////
  13. /// CONSTANTS ///
  14. /////////////////
  15. using symbol_t = scripting::ast::symbol_t;
  16. constexpr std::array<std::pair<std::string_view, symbol_t>, 25> operators {
  17. std::pair<std::string_view, symbol_t>{"(", symbol_t::l_paren},
  18. std::pair<std::string_view, symbol_t>{")", symbol_t::r_paren},
  19. std::pair<std::string_view, symbol_t>{"!=", symbol_t::different},
  20. std::pair<std::string_view, symbol_t>{"!", symbol_t::logical_not},
  21. std::pair<std::string_view, symbol_t>{"~", symbol_t::binary_not},
  22. std::pair<std::string_view, symbol_t>{"/", symbol_t::divide},
  23. std::pair<std::string_view, symbol_t>{"%", symbol_t::modulo},
  24. std::pair<std::string_view, symbol_t>{"*", symbol_t::multiply},
  25. std::pair<std::string_view, symbol_t>{"-", symbol_t::subtract},
  26. std::pair<std::string_view, symbol_t>{"+", symbol_t::add},
  27. std::pair<std::string_view, symbol_t>{"<<<", symbol_t::rotate_left},
  28. std::pair<std::string_view, symbol_t>{">>>", symbol_t::rotate_right},
  29. std::pair<std::string_view, symbol_t>{"<<", symbol_t::bitshift_left},
  30. std::pair<std::string_view, symbol_t>{">>", symbol_t::bitshift_right},
  31. std::pair<std::string_view, symbol_t>{"<=", symbol_t::less_or_equal_than},
  32. std::pair<std::string_view, symbol_t>{">=", symbol_t::greater_or_equal_than},
  33. std::pair<std::string_view, symbol_t>{"<", symbol_t::less_than},
  34. std::pair<std::string_view, symbol_t>{">", symbol_t::greater_than},
  35. std::pair<std::string_view, symbol_t>{"==", symbol_t::equals},
  36. std::pair<std::string_view, symbol_t>{"&&", symbol_t::logical_and},
  37. std::pair<std::string_view, symbol_t>{"&", symbol_t::binary_and},
  38. std::pair<std::string_view, symbol_t>{"||", symbol_t::logical_or},
  39. std::pair<std::string_view, symbol_t>{"|", symbol_t::binary_or},
  40. std::pair<std::string_view, symbol_t>{"^", symbol_t::binary_xor},
  41. std::pair<std::string_view, symbol_t>{"\n", symbol_t::new_line}
  42. };
  43. const std::vector<std::string_view> reserved_character_sequences {
  44. "(",
  45. ")",
  46. "!=",
  47. "!",
  48. "~",
  49. "/",
  50. "%",
  51. "*",
  52. "-",
  53. "+",
  54. "<<<",
  55. ">>>",
  56. "<<",
  57. ">>",
  58. "<=",
  59. ">=",
  60. "<",
  61. ">",
  62. "==",
  63. "&&",
  64. "&",
  65. "||",
  66. "|",
  67. "^",
  68. "=",
  69. "\n"
  70. };
  71. using token = scripting::ast::token;
  72. /////////////////////
  73. /// LEXER HELPERS ///
  74. /////////////////////
  75. struct lex_token_result {
  76. token tok;
  77. std::string_view rest;
  78. };
  79. struct rune_ref {
  80. std::string_view str;
  81. explicit operator uint32_t() const {
  82. if(str.empty()) return 0;
  83. if(str.size() == 1) return str[0];
  84. auto bytes = 8 - (str.size() + 1);
  85. uint32_t rune = static_cast<const uint8_t>(str[0]) & (1 << (bytes - 1));
  86. for(auto c : str.substr(1)) {
  87. rune <<= 6;
  88. rune ^= static_cast<const uint8_t>(c) & 0b00111111;
  89. }
  90. return rune;
  91. }
  92. [[nodiscard]] bool is_space() const {
  93. constexpr std::array<uint32_t, 19> spaces{
  94. 0x0020, 0x00A0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
  95. 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x2002, 0x205F, 0x3000
  96. };
  97. return std::find(spaces.begin(), spaces.end(), static_cast<uint32_t>(*this)) != spaces.end();
  98. }
  99. };
  100. struct try_rune_result {
  101. rune_ref rune;
  102. std::string_view rest;
  103. };
  104. std::shared_ptr<scripting::code_location> get_loc(std::string_view original, std::string_view rest, std::shared_ptr<const std::string> last_line) {
  105. // TODO: Check everything again for weird ass cases
  106. if(original.empty()) {
  107. return std::make_shared<scripting::code_location>(scripting::code_location{
  108. .line_contents = std::make_shared<std::string>(),
  109. .line_number = (int32_t)std::clamp<size_t>(1, 1, std::numeric_limits<int32_t>::max()),
  110. .column_number = (int32_t)std::clamp<size_t>(1 + 1, 1, std::numeric_limits<int32_t>::max())
  111. });
  112. }
  113. const auto before = original.substr(0, original.size() - rest.size());
  114. const auto line_no = std::ranges::count(before, '\n') + 1;
  115. const auto line_start = std::find(before.crbegin(), before.crend(), '\n');
  116. const auto column_no = line_start != before.crend() ? (line_start - before.crbegin()) : before.size();
  117. const auto back_tracked = before.size() - column_no;
  118. const auto front_tracked = rest.empty() ? original.size() : before.size() + (std::ranges::find(rest, '\n') - rest.begin());
  119. const std::string_view current{original.begin() + back_tracked, original.begin() + front_tracked};
  120. if(not last_line || *last_line != current) {
  121. last_line = std::make_shared<std::string>(current);
  122. }
  123. return std::make_shared<scripting::code_location>(scripting::code_location{
  124. .line_contents = last_line,
  125. .line_number = (int32_t)std::clamp<size_t>(line_no, 1, std::numeric_limits<int32_t>::max()),
  126. .column_number = (int32_t)std::clamp<size_t>(column_no + 1, 1, std::numeric_limits<int32_t>::max())
  127. });
  128. }
  129. ////////////////////
  130. /// LEXER PROPER ///
  131. ////////////////////
  132. auto try_rune(std::string_view text, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) -> std::optional<try_rune_result> {
  133. static_assert(CHAR_BIT == 8, "Get your weird ass cpu outta here");
  134. if(text.empty()) return std::nullopt;
  135. if(0 == (*reinterpret_cast<const uint8_t*>(&text.front()) & 0b10000000)) {
  136. return try_rune_result{text.substr(0, 1), text.substr(1)};
  137. }
  138. switch(auto bytes = std::countl_one(*reinterpret_cast<const uint8_t*>(&text.front())); bytes) {
  139. case 0: // ASCII
  140. {
  141. return try_rune_result{text.substr(0, 1), text.substr(1)};
  142. }
  143. case 1: // Middle of sequence
  144. {
  145. return std::nullopt;
  146. }
  147. case 7: [[fallthrough]];
  148. case 8: // Invalid sequence start
  149. {
  150. return std::nullopt;
  151. }
  152. default: // Maybe it is valid
  153. {
  154. if(text.size() < bytes) { // Nope, too short to get a full rune
  155. errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"});
  156. return std::nullopt;
  157. }
  158. auto rune = text.substr(0,bytes);
  159. // Check if the rest of the rune is valid
  160. if(std::ranges::any_of(rune.substr(1), [](const char& byte){ return std::countl_one(*reinterpret_cast<const uint8_t*>(&byte)) != 1;})) {
  161. errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"});
  162. return std::nullopt;
  163. }
  164. return try_rune_result{rune, text.substr(bytes)};
  165. }
  166. }
  167. }
  168. constexpr auto try_string = [](std::string_view view, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) -> std::optional<lex_token_result> {
  169. constexpr std::array<int8_t, 256> hexdigits = {
  170. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  171. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  172. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  173. +0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
  174. -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  175. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  176. -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  177. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  178. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  179. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  180. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  181. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  182. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  183. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  184. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  185. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  186. };
  187. auto it = view.begin();
  188. while (it != view.end() and std::isspace(*it)) ++it;
  189. if(it == view.end()) return std::nullopt;
  190. std::stringstream generated;
  191. if(*it != '"') return std::nullopt;
  192. std::string str;
  193. while(true) {
  194. ++it;
  195. if(it == view.end()) {
  196. errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"});
  197. return std::nullopt;
  198. }
  199. switch(*it) {
  200. case '\\':
  201. ++it;
  202. if(it == view.end()) {
  203. errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"});
  204. }
  205. switch(*it) {
  206. case '\\': generated << '\\'; break;
  207. case 'a': generated << '\a'; break;
  208. case 'b': generated << '\b'; break;
  209. case 'f': generated << '\f'; break;
  210. case 'n': generated << '\n'; break;
  211. case 'r': generated << '\r'; break;
  212. case 't': generated << '\t'; break;
  213. case 'v': generated << '\v'; break;
  214. case '\'': generated << '\''; break;
  215. case '"': generated << '"'; break;
  216. case '0': [[fallthrough]];
  217. case '1': [[fallthrough]];
  218. case '2': [[fallthrough]];
  219. case '3': [[fallthrough]];
  220. case '4': [[fallthrough]];
  221. case '5': [[fallthrough]];
  222. case '6': [[fallthrough]];
  223. case '7':
  224. {
  225. char c = uint8_t(*it - '0') * 8 * 8;
  226. if(uint8_t(*it - '0') > 8) {
  227. errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"});
  228. }
  229. ++it;
  230. if(it == view.end()) return std::nullopt;
  231. c += uint8_t(*it - '0') * 8;
  232. if(uint8_t(*it - '0') > 8) {
  233. errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"});
  234. }
  235. ++it;
  236. if(it == view.end()) return std::nullopt;
  237. c += uint8_t(*it - '0');
  238. if(uint8_t(*it - '0') > 8) {
  239. errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"});
  240. }
  241. generated << c;
  242. break; }
  243. case 'x':
  244. {
  245. ++it;
  246. if(it == view.end()) return std::nullopt;
  247. if(hexdigits[*it] < 0) return std::nullopt;
  248. char c = hexdigits[*it] << 4;
  249. ++it;
  250. if(it == view.end()) return std::nullopt;
  251. if(hexdigits[*it] < 0) return std::nullopt;
  252. c += hexdigits[*it];
  253. generated << c;
  254. break; }
  255. default:
  256. generated << *it;
  257. }
  258. break;
  259. case '"':
  260. str = generated.str();
  261. return lex_token_result {
  262. token{.location = location, .value = std::string(str)},
  263. std::string_view(++it, view.end())
  264. };
  265. default:
  266. generated << *it;
  267. break;
  268. }
  269. }
  270. };
  271. constexpr auto try_int32 = [](std::string_view view, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) -> std::optional<lex_token_result> {
  272. int32_t i;
  273. auto v = std::from_chars(view.begin(), view.end(), i);
  274. if(v.ptr == view.begin()) return std::nullopt;
  275. auto rest = std::string_view(v.ptr, view.end());
  276. return lex_token_result{
  277. token{.location = std::move(location), .value = i},
  278. rest
  279. };
  280. };
  281. std::optional<lex_token_result> try_operator(std::string_view code, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) {
  282. for(auto& [representation, type] : operators) {
  283. if(code.starts_with(representation)) {
  284. return lex_token_result{
  285. token{.location = location, .value = type},
  286. code.substr(representation.size())
  287. };
  288. }
  289. }
  290. return std::nullopt;
  291. }
  292. auto try_identifier(std::string_view view, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) -> std::optional<lex_token_result> {
  293. constexpr auto starts_with_reserved = [](std::string_view v) -> bool {
  294. return std::ranges::any_of(reserved_character_sequences, [&](auto seq){
  295. return v.starts_with(seq);
  296. });
  297. };
  298. std::stringstream identifier_value;
  299. if(view.empty()) return std::nullopt;
  300. while(!view.empty() && !starts_with_reserved(view)) {
  301. if(auto maybe_rune = try_rune(view, location, errors); maybe_rune) {
  302. auto [rune, rest] = maybe_rune.value();
  303. if(rune.is_space()) {
  304. view = rest;
  305. break;
  306. }
  307. identifier_value << rune.str;
  308. view = rest;
  309. } else {
  310. errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"});
  311. return std::nullopt;
  312. }
  313. }
  314. scripting::ast::identifier result {.location = location, .value = identifier_value.str()};
  315. if(result.value.empty()) return std::nullopt;
  316. return lex_token_result{.tok = token{.location = location, .value = result}, .rest = view};
  317. }
  318. std::vector<token> scripting::ast::lex(const std::string& code, std::vector<scripting::script_error>& errors) {
  319. std::vector<token> return_value;
  320. std::string_view current = code;
  321. std::shared_ptr<const std::string> last_line;
  322. while(not current.empty()) {
  323. for(;;) {
  324. if(current.empty()) break;
  325. auto location = get_loc(code, current, last_line);
  326. auto c = try_rune(current, location, errors);
  327. if(not c.has_value()) {
  328. errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 encoding detected while trimming space"});
  329. return return_value;
  330. } else {
  331. if(c.value().rune.is_space()) {
  332. current = c.value().rest;
  333. } else break;
  334. }
  335. }
  336. auto location = get_loc(code, current, last_line);
  337. last_line = location->line_contents;
  338. auto res = try_string(current, location, errors);
  339. if (!res) res = try_operator(current, location, errors);
  340. if (!res) res = try_int32(current, location, errors);
  341. if (!res) res = try_identifier(current, location, errors);
  342. if(res.has_value()) {
  343. current = res.value().rest;
  344. return_value.emplace_back(std::move(res.value().tok));
  345. } else {
  346. errors.push_back(scripting::script_error{.location = location, .message = "Unknown token"});
  347. return return_value;
  348. }
  349. }
  350. return return_value;
  351. }