From 7970cb197ad19f3f68afb97cc02f13c3582d3144 Mon Sep 17 00:00:00 2001 From: Parker TenBroeck <51721964+ParkerTenBroeck@users.noreply.github.com> Date: Thu, 18 Dec 2025 21:49:13 -0500 Subject: [PATCH] first --- .gitignore | 1 + Cargo.lock | 7 ++ Cargo.toml | 6 ++ default.nix | 35 +++++++ example.txt | 17 +++ src/dfa.rs | 12 +++ src/lexer.rs | 215 ++++++++++++++++++++++++++++++++++++++ src/lib.rs | 11 ++ src/main.rs | 7 ++ src/parser.rs | 284 ++++++++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 595 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 default.nix create mode 100644 example.txt create mode 100644 src/dfa.rs create mode 100644 src/lexer.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/parser.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..70333d6 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "automata" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..b7008b4 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "automata" +version = "0.1.0" +edition = "2024" + +[dependencies] diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..9de37f8 --- /dev/null +++ b/default.nix @@ -0,0 +1,35 @@ +{ pkgs ? 
import {} }: + pkgs.mkShell rec { + buildInputs = with pkgs; [ + clang + # Replace llvmPackages with llvmPackages_X, where X is the latest LLVM version (at the time of writing, 16) + llvmPackages.bintools + rustup + nasm + ]; + RUSTC_VERSION = "nightly"; + # https://github.com/rust-lang/rust-bindgen#environment-variables + LIBCLANG_PATH = pkgs.lib.makeLibraryPath [ pkgs.llvmPackages_latest.libclang.lib ]; + shellHook = '' + export PATH=$PATH:''${CARGO_HOME:-~/.cargo}/bin + export PATH=$PATH:''${RUSTUP_HOME:-~/.rustup}/toolchains/$RUSTC_VERSION-x86_64-unknown-linux-gnu/bin/ + ''; + # Add precompiled library to rustc search path + RUSTFLAGS = (builtins.map (a: ''-L ${a}/lib'') [ + # add libraries here (e.g. pkgs.libvmi) + ]); + # Add glibc, clang, glib and other headers to bindgen search path + BINDGEN_EXTRA_CLANG_ARGS = + # Includes with normal include path + (builtins.map (a: ''-I"${a}/include"'') [ + # add dev libraries here (e.g. pkgs.libvmi.dev) + pkgs.glibc.dev + ]) + # Includes with special directory paths + ++ [ + ''-I"${pkgs.llvmPackages_latest.libclang.lib}/lib/clang/${pkgs.llvmPackages_latest.libclang.version}/include"'' + ''-I"${pkgs.glib.dev}/include/glib-2.0"'' + ''-I${pkgs.glib.out}/lib/glib-2.0/include/'' + ]; + + } diff --git a/example.txt b/example.txt new file mode 100644 index 0000000..d0526f8 --- /dev/null +++ b/example.txt @@ -0,0 +1,17 @@ +Q = {q0, q1} // states +E = {a, b} // alphabet +T = {z0, A, B} // stack symbols + +// construct all possible permutations of A's and B's +d(q0, epsilon, z0)={(q0, A z0), (q0, B z0)} +d(q0, epsilon, A)={(q0, A A), (q0, B A)} +d(q0, epsilon, B)={(q0, A B), (q0, B B)} + +// transition to q1 +d(q0, epsilon, z0)={(q1, z0)} +d(q0, epsilon, A)={(q1, A)} +d(q0, epsilon, B)={(q1, B)} + +// consume stack until empty +d(q1, a, A)={(q1, epsilon)} +d(q1, b, B)={(q1, epsilon)} \ No newline at end of file diff --git a/src/dfa.rs b/src/dfa.rs new file mode 100644 index 0000000..8fb3bb0 --- /dev/null +++ b/src/dfa.rs @@ -0,0 
/// A lexical token of the automaton description language.
///
/// Variants that carry text borrow it from the lexer input.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)]
pub enum Token<'a> {
    /// `(`
    LPar,
    /// `)`
    RPar,

    /// `{`
    LBrace,
    /// `}`
    RBrace,

    /// `[`
    LBracket,
    /// `]`
    RBracket,

    /// `~`
    Tilde,
    /// `=`
    Eq,
    /// `,`
    Comma,

    /// `|`
    Or,
    /// `+`
    Plus,
    /// `*`
    Star,
    /// `&`
    And,

    /// `->`
    LSmallArrow,
    /// `=>`
    LBigArrow,

    /// Body of a `// ...` or `/* ... */` comment, without the opening
    /// delimiter (and without the closing `*/` for block comments).
    Comment(&'a str),

    /// An identifier: a letter or `_` followed by letters, digits or `_`.
    Ident(&'a str),
}

impl<'a> std::fmt::Display for Token<'a> {
    /// Writes the token as it is spelled in source text.
    ///
    /// Identifiers print as the placeholder `ident`; with the alternate flag
    /// (`{:#}`) the actual name is printed, quoted. Comments print nothing.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = match self {
            // The opening parenthesis is `(` and the closing one is `)`.
            Token::LPar => "(",
            Token::RPar => ")",
            Token::LBrace => "{",
            Token::RBrace => "}",
            Token::LBracket => "[",
            Token::RBracket => "]",
            Token::Tilde => "~",
            Token::Eq => "=",
            Token::Comma => ",",
            Token::Or => "|",
            Token::Plus => "+",
            Token::Star => "*",
            Token::And => "&",
            Token::LSmallArrow => "->",
            Token::LBigArrow => "=>",
            Token::Comment(_) => "",
            Token::Ident(ident) if f.alternate() => return write!(f, "{ident:?}"),
            Token::Ident(_) => "ident",
        };
        f.write_str(text)
    }
}

/// A half-open byte range `start..end` into the lexer input.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)]
pub struct Span(pub usize, pub usize);
impl Span {
    /// Returns the span that starts where `self` starts and ends where `end` ends.
    pub fn join(&self, end: Span) -> Span {
        Span(self.0, end.1)
    }
}

/// A value paired with the span of source text it was produced from.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)]
pub struct Spanned<T>(pub T, pub Span);
impl<T> Spanned<T> {
    /// Transforms the wrapped value and keeps the span.
    ///
    /// The closure is called exactly once, so `FnOnce` is sufficient; every
    /// `Fn` closure or constructor accepted before is still accepted.
    pub fn map<R>(self, map: impl FnOnce(T) -> R) -> Spanned<R> {
        Spanned(map(self.0), self.1)
    }
}

/// Streaming tokenizer over a string slice.
///
/// Iterating yields one [`Spanned`] result per token or lexical error.
#[derive(Clone, Copy, Debug)]
pub struct Lexer<'a> {
    input: &'a str,

    /// Byte offset at which the token currently being lexed starts.
    start: usize,
    /// Byte offset of the next unread character.
    position: usize,
}

/// Lexical errors reported by [`Lexer`].
#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)]
pub enum Error {
    /// A character that cannot start a token (or a lone `-` / `/`).
    InvalidChar(char),
    /// A `/*` comment that reaches end of input without a closing `*/`.
    UnclosedMultiLine,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `input`.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            start: 0,
            position: 0,
        }
    }

    /// Returns the next unread character without consuming it.
    fn peek(&self) -> Option<char> {
        self.input.get(self.position..)?.chars().next()
    }

    /// Consumes and returns the next character, advancing by its UTF-8 width.
    fn consume(&mut self) -> Option<char> {
        let next = self.peek()?;
        self.position += next.len_utf8();
        Some(next)
    }

    /// Consumes characters for as long as `keep` accepts them.
    fn consume_while(&mut self, keep: impl Fn(char) -> bool) {
        while let Some(c) = self.peek() {
            if !keep(c) {
                break;
            }
            self.consume();
        }
    }

    /// Slices the input; the result borrows from the input, not from `self`.
    fn lexeme(&self, from: usize, to: usize) -> &'a str {
        let input: &'a str = self.input;
        &input[from..to]
    }

    /// Consumes a following `>` and yields `arrow`, otherwise yields `otherwise`.
    fn arrow_or(
        &mut self,
        arrow: Token<'a>,
        otherwise: Result<Token<'a>, Error>,
    ) -> Result<Token<'a>, Error> {
        if self.peek() == Some('>') {
            self.consume();
            Ok(arrow)
        } else {
            otherwise
        }
    }

    /// Lexes the rest of a `//` comment; both slashes are already consumed.
    ///
    /// The comment text includes the terminating newline when there is one.
    fn line_comment(&mut self) -> Token<'a> {
        while !matches!(self.consume(), Some('\n') | None) {}
        Token::Comment(self.lexeme(self.start + 2, self.position))
    }

    /// Lexes the rest of a `/* */` comment; the opening `/*` is already consumed.
    fn block_comment(&mut self) -> Result<Token<'a>, Error> {
        loop {
            match self.consume() {
                Some('*') if self.peek() == Some('/') => {
                    self.consume();
                    // `/*` and `*/` are two bytes each, so these offsets are
                    // always on character boundaries.
                    return Ok(Token::Comment(
                        self.lexeme(self.start + 2, self.position - 2),
                    ));
                }
                Some(_) => {}
                None => return Err(Error::UnclosedMultiLine),
            }
        }
    }

    /// Span of zero width located at the end of the input.
    pub fn eof_span(&self) -> Span {
        Span(self.input.len(), self.input.len())
    }
}

impl<'a> std::iter::Iterator for Lexer<'a> {
    type Item = Spanned<Result<Token<'a>, Error>>;

    /// Skips whitespace and lexes the next token.
    ///
    /// Returns `None` at end of input. Lexical errors are yielded as `Err`
    /// items and lexing continues after the offending text.
    fn next(&mut self) -> Option<Self::Item> {
        self.consume_while(char::is_whitespace);
        self.start = self.position;

        let res = match self.consume()? {
            '(' => Ok(Token::LPar),
            ')' => Ok(Token::RPar),
            '{' => Ok(Token::LBrace),
            '}' => Ok(Token::RBrace),
            '[' => Ok(Token::LBracket),
            ']' => Ok(Token::RBracket),
            '~' => Ok(Token::Tilde),
            '+' => Ok(Token::Plus),
            '*' => Ok(Token::Star),
            '&' => Ok(Token::And),
            ',' => Ok(Token::Comma),
            '|' => Ok(Token::Or),
            '=' => self.arrow_or(Token::LBigArrow, Ok(Token::Eq)),
            '-' => self.arrow_or(Token::LSmallArrow, Err(Error::InvalidChar('-'))),

            '/' => match self.peek() {
                Some('/') => {
                    self.consume();
                    Ok(self.line_comment())
                }
                Some('*') => {
                    self.consume();
                    self.block_comment()
                }
                // A lone slash: whatever follows is left for the next token.
                _ => Err(Error::InvalidChar('/')),
            },

            c if c.is_alphabetic() || c == '_' => {
                self.consume_while(|c| c.is_alphanumeric() || c == '_');
                Ok(Token::Ident(self.lexeme(self.start, self.position)))
            }

            c => Err(Error::InvalidChar(c)),
        };
        let span = Span(self.start, self.position);
        self.start = self.position;
        Some(Spanned(res, span))
    }
}

#[test]
fn tokenizer() {
    let tests = [
        "",
        "/*",
        "/**",
        "/*/",
        "/**/",
        "/",
        "//",
        "()[]{}~=>==>->-+*&|, hello _th012is__ a wondweful",
    ];

    for &test in tests.iter() {
        let tokens = Lexer::new(test).collect::<Vec<_>>();
        println!("'{test}': {:?}", tokens);

        // Spans must be well-formed, in order, and inside the input.
        let mut previous_end = 0;
        for Spanned(_, Span(start, end)) in tokens {
            assert!(previous_end <= start && start <= end && end <= test.len());
            previous_end = end;
        }
    }
}
Symbol(Symbol<'a>), + Tuple(Tuple<'a>), + List(List<'a>) +} + +#[derive(Clone, Debug)] +pub struct List<'a>(pub Vec>>); + +#[derive(Clone, Debug)] +pub enum TopLevel<'a> { + Assignment(Spanned>, Spanned>), + Table(), +} + +pub enum LogKind { + Lexer, + UnexpectedToken, +} + +pub enum LogLevel { + Info, + Warning, + Error, +} + +pub struct Log { + pub message: String, + pub range: Span, + pub level: LogLevel, + pub kind: LogKind, +} + +pub struct Parser<'a> { + lexer: Peekable>, + log: Vec, + eof: Span, +} + +impl<'a> Parser<'a> { + pub fn new(lexer: Lexer<'a>) -> Self{ + Parser { eof: lexer.eof_span(), lexer: lexer.peekable(), log: Vec::new() } + } + + fn next_token(&mut self) -> Option>> { + loop { + match self.lexer.next()? { + Spanned(Ok(Token::Comment(_)), _) => {} + Spanned(Ok(ok), r) => return Some(Spanned(ok, r)), + Spanned(Err(err), r) => self.log.push(Log { + message: format!("{err:?}"), + range: r, + level: LogLevel::Error, + kind: LogKind::Lexer, + }), + } + } + } + + fn peek_token(&mut self) -> Option>> { + loop { + match *self.lexer.peek()? 
{ + // not a heavy clone but because of range + Spanned(Ok(ok), r) => return Some(Spanned(ok, r)), + Spanned(Err(err), r) => self.log.push(Log { + message: format!("{err:?}"), + range: r, + level: LogLevel::Error, + kind: LogKind::Lexer, + }), + } + } + } + + fn expect_token(&mut self, expected: Token<'a>) -> (bool, Span) { + if let Some(Spanned(token, range)) = self.next_token() { + if token != expected { + self.log.push(Log { + message: format!("unexpected token {:#}, expected {:}", token, expected), + range, + level: LogLevel::Error, + kind: LogKind::Lexer, + }); + (false, range) + }else{ + (true, range) + } + } else { + self.log.push(Log { + message: format!("unexpected eof expected {:#}", expected), + range: self.eof, + level: LogLevel::Error, + kind: LogKind::Lexer, + }); + (false, self.eof) + } + } + + pub fn parse_symbol(&mut self) -> Spanned> { + match self.next_token() { + Some(Spanned(Token::Tilde, r)) => Spanned(Symbol::Epsilon, r), + Some(Spanned(Token::Ident("epsilon"), r)) => Spanned(Symbol::Epsilon, r), + Some(Spanned(Token::Ident("ε"), r)) => Spanned(Symbol::Epsilon, r), + Some(Spanned(Token::Ident(ident), r)) => Spanned(Symbol::Ident(ident), r), + Some(Spanned(got, r)) => { + self.log.push(Log { + message: format!( + "unexpected token {:#}, expected {:}|{:}", + got, + Token::Tilde, + Token::Ident("") + ), + range: self.eof, + level: LogLevel::Error, + kind: LogKind::Lexer, + }); + Spanned(Symbol::Ident(""), r) + } + None => { + self.log.push(Log { + message: format!( + "unexpected eof expected {:}|{:}", + Token::Tilde, + Token::Ident("") + ), + range: self.eof, + level: LogLevel::Error, + kind: LogKind::Lexer, + }); + Spanned(Symbol::Ident(""), self.eof) + } + } + } + + pub fn parse_tupple(&mut self) -> Spanned> { + let mut items = Vec::new(); + let (matched, start) = self.expect_token(Token::LPar); + if !matched{ + return Spanned(Tuple(Vec::new()), start) + } + + while !matches!(self.peek_token(), Some(Spanned(Token::RPar, _))) { + 
items.push(self.parse_symbol()); + if matches!(self.peek_token(), Some(Spanned(Token::Comma, _))) { + self.next_token(); + } + if self.peek_token().is_none(){ + self.log.push(Log { + message: format!( + "unexpected eof expected {:}", + Token::RPar + ), + range: self.eof, + level: LogLevel::Error, + kind: LogKind::Lexer, + }); + break; + } + } + + let (_, end) = self.expect_token(Token::RPar); + + Spanned(Tuple(items), start.join(end)) + } + + pub fn parse_item(&mut self) -> Spanned>{ + match self.peek_token(){ + Some(Spanned(Token::Ident(_)|Token::Tilde, _)) => self.parse_symbol().map(Item::Symbol), + Some(Spanned(Token::LPar, _)) => self.parse_tupple().map(Item::Tuple), + Some(Spanned(Token::LBrace, _)) => self.parse_list().map(Item::List), + Some(Spanned(got, r)) => { + self.log.push(Log { + message: format!( + "unexpected token {:#}, expected {:}|{:}|{:}|{:}", + got, + Token::Tilde, + Token::Ident(""), + Token::LPar, + Token::LBrace + ), + range: self.eof, + level: LogLevel::Error, + kind: LogKind::Lexer, + }); + Spanned(Item::Symbol(Symbol::Ident("")), r) + } + None => { + self.log.push(Log { + message: format!( + "unexpected eof expected {:}|{:}|{:}|{:}", + Token::Tilde, + Token::Ident(""), + Token::LPar, + Token::LBrace + ), + range: self.eof, + level: LogLevel::Error, + kind: LogKind::Lexer, + }); + Spanned(Item::Symbol(Symbol::Ident("")), self.eof) + } + } + } + + pub fn parse_list(&mut self) -> Spanned>{ + let mut list = Vec::new(); + let (matched, start) = self.expect_token(Token::LBrace); + if !matched{ + return Spanned(List(Vec::new()), start) + } + + while !matches!(self.peek_token(), Some(Spanned(Token::RBrace, _))) { + list.push(self.parse_item()); + if matches!(self.peek_token(), Some(Spanned(Token::Comma, _))) { + self.next_token(); + } + if self.peek_token().is_none(){ + self.log.push(Log { + message: format!( + "unexpected eof expected {:}", + Token::RBrace + ), + range: self.eof, + level: LogLevel::Error, + kind: LogKind::Lexer, + }); + break; + 
} + } + let (_, end) = self.expect_token(Token::RBrace); + Spanned(List(list), start.join(end)) + } + + pub fn parse_elements(&mut self) -> Vec>> { + let mut result = Vec::new(); + + loop { + let Some(next) = self.next_token() else { break }; + match next { + Spanned(Token::Ident(ident), ident_range) => { + let dest @ Spanned(_, start) = if matches!(self.peek_token(), Some(Spanned(Token::LPar, _))) { + let tuple = self.parse_tupple(); + let span = ident_range.join(tuple.1); + Spanned(Dest::Function(Spanned(ident, ident_range), tuple), span) + } else { + Spanned(Dest::Ident(ident), ident_range) + }; + self.expect_token(Token::Eq); + + let item = self.parse_item(); + let span = start.join(item.1); + result.push(Spanned(TopLevel::Assignment(dest, item), span)); + } + _ => self.log.push(Log { + message: format!("unexpected token {:#}, expected {:}", next.0, Token::Ident("")), + range: next.1, + level: LogLevel::Error, + kind: LogKind::Lexer, + }), + } + } + result + } +}