// src/lexer.rs -- lexer module of the `language` crate (version 0.1.0, edition 2021),
// reconstructed from the "Add basic lexer" patch (Guilherme Werner, 2024-02-26).
//
// The same patch also adds:
//   * Cargo.toml / Cargo.lock -- package `language`, no active dependencies
//     (an `inkwell` dependency is present but commented out);
//   * src/lib.rs -- `pub mod lexer;`
//   * src/main.rs -- a demo that lexes a small program and prints every token:
//
//         use language::lexer::{Lexer, Token};
//
//         fn main() {
//             let input = r#"
//                 var a;
//                 a = 1 + 2;
//                 function add(a, b) {
//                     return a + b;
//                 }
//             "#;
//
//             let mut lexer = Lexer::new(input);
//             let tokens: Vec<Token> = lexer.by_ref().collect();
//
//             for token in tokens {
//                 println!("{:?}", token);
//             }
//         }

/// Represents a primitive syntax token.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    Comma,
    Comment,
    Const,
    Else,
    EOF,
    For,
    Function,
    Ident(String),
    If,
    Import,
    LParen,
    Number(f64),
    Op(char),
    Return,
    RParen,
    SemiColon,
    Var,
}

/// Defines an error encountered by the `Lexer`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError {
    /// Human-readable description of what went wrong.
    pub error: &'static str,
    /// Byte offset into the source at which the offending token starts.
    pub index: usize,
}

impl LexerError {
    /// Creates an error that carries no useful position (`index` is 0).
    // The lexer itself always reports a position; this constructor is kept
    // for callers that relied on it.
    #[allow(unused)]
    pub fn new(msg: &'static str) -> LexerError {
        LexerError {
            error: msg,
            index: 0,
        }
    }

    /// Creates an error located at byte offset `index` of the source.
    pub fn with_index(msg: &'static str, index: usize) -> LexerError {
        LexerError { error: msg, index }
    }
}

/// Defines the result of a lexing operation; namely a
/// `Token` on success, or a `LexerError` on failure.
pub type LexerResult = Result<Token, LexerError>;

/// Splits source text into `Token`s, one call to [`Lexer::lex`] at a time.
pub struct Lexer<'a> {
    /// The complete source text.
    input: &'a str,
    /// Byte offset of the next unread character; always on a char boundary.
    pos: usize,
}

impl<'a> Lexer<'a> {
    /// Creates a new `Lexer`, given its source `input`.
    pub fn new(input: &'a str) -> Lexer<'a> {
        Lexer { input, pos: 0 }
    }

    /// Returns the next unread character without consuming it.
    fn peek(&self) -> Option<char> {
        self.input[self.pos..].chars().next()
    }

    /// Consumes and returns the next character, advancing by its UTF-8 width.
    fn bump(&mut self) -> Option<char> {
        let ch = self.peek()?;
        self.pos += ch.len_utf8();
        Some(ch)
    }

    /// Consumes characters for as long as `keep` accepts them.
    /// Reaching the end of input simply ends the run.
    fn eat_while(&mut self, keep: impl Fn(char) -> bool) {
        while let Some(ch) = self.peek() {
            if !keep(ch) {
                break;
            }
            self.pos += ch.len_utf8();
        }
    }

    /// Lexes and returns the next `Token` from the source code.
    ///
    /// Leading whitespace is skipped. `Token::EOF` is returned once the input
    /// is exhausted, and on every call after that.
    ///
    /// # Errors
    ///
    /// Returns a `LexerError` whose `index` is the byte offset of the token
    /// when a number literal cannot be parsed as an `f64` (for example `.` or
    /// `1.2.3`). The malformed literal is consumed, so lexing can continue
    /// with the text that follows it.
    pub fn lex(&mut self) -> LexerResult {
        self.eat_while(char::is_whitespace);

        let start = self.pos;
        let first = match self.bump() {
            Some(ch) => ch,
            None => return Ok(Token::EOF),
        };

        match first {
            '(' => Ok(Token::LParen),
            ')' => Ok(Token::RParen),
            ',' => Ok(Token::Comma),
            ';' => Ok(Token::SemiColon),

            '#' => {
                // A comment runs to the end of the line. The newline is
                // consumed too when there is one; a comment that ends the
                // input just stops there.
                self.eat_while(|ch| ch != '\n');
                self.bump();

                Ok(Token::Comment)
            }

            '.' | '0'..='9' => {
                // Number literal. Besides digits and '.', ASCII hex letters
                // are accepted so that exponent forms such as `1e5` stay a
                // single token; anything `f64` cannot parse becomes an error
                // instead of a panic.
                self.eat_while(|ch| ch == '.' || ch.is_ascii_hexdigit());

                self.input[start..self.pos]
                    .parse::<f64>()
                    .map(Token::Number)
                    .map_err(|_| LexerError::with_index("invalid number literal", start))
            }

            'a'..='z' | 'A'..='Z' | '_' => {
                // A word-like identifier only contains underscores and
                // alphanumeric characters.
                self.eat_while(|ch| ch == '_' || ch.is_alphanumeric());

                let token = match &self.input[start..self.pos] {
                    "function" => Token::Function,
                    "import" => Token::Import,
                    "if" => Token::If,
                    "else" => Token::Else,
                    "for" => Token::For,
                    "var" => Token::Var,
                    "return" => Token::Return,
                    "const" => Token::Const,

                    ident => Token::Ident(ident.to_string()),
                };

                Ok(token)
            }

            // Any other single character is treated as an operator.
            op => Ok(Token::Op(op)),
        }
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = Token;

    /// Lexes the next `Token` and returns it.
    /// On EOF or failure, `None` will be returned.
    ///
    /// The iterator cannot tell the two apart; call [`Lexer::lex`] directly
    /// when the error itself is needed.
    fn next(&mut self) -> Option<Self::Item> {
        match self.lex() {
            Ok(Token::EOF) | Err(_) => None,
            Ok(token) => Some(token),
        }
    }
}