This commit is contained in:
Parker TenBroeck 2025-12-18 21:49:13 -05:00
commit 7970cb197a
10 changed files with 595 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

7
Cargo.lock generated Normal file
View file

@ -0,0 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "automata"
version = "0.1.0"

6
Cargo.toml Normal file
View file

@ -0,0 +1,6 @@
[package]
name = "automata"
version = "0.1.0"
edition = "2024"
[dependencies]

35
default.nix Normal file
View file

@ -0,0 +1,35 @@
# Development shell for this repository.
# Provides clang, LLVM bintools, rustup and nasm, and exports the environment
# variables that rustc and rust-bindgen read (LIBCLANG_PATH, RUSTFLAGS,
# BINDGEN_EXTRA_CLANG_ARGS).
{ pkgs ? import <nixpkgs> {} }:
pkgs.mkShell rec {
buildInputs = with pkgs; [
clang
# Replace llvmPackages with llvmPackages_X, where X is the latest LLVM version (at the time of writing, 16)
llvmPackages.bintools
rustup
nasm
];
# Toolchain name; interpolated into the rustup toolchain path in shellHook below.
RUSTC_VERSION = "nightly";
# https://github.com/rust-lang/rust-bindgen#environment-variables
LIBCLANG_PATH = pkgs.lib.makeLibraryPath [ pkgs.llvmPackages_latest.libclang.lib ];
# Appends the cargo bin directory and the selected rustup toolchain's bin
# directory to PATH. The toolchain path hard-codes the
# x86_64-unknown-linux-gnu target triple.
shellHook = ''
export PATH=$PATH:''${CARGO_HOME:-~/.cargo}/bin
export PATH=$PATH:''${RUSTUP_HOME:-~/.rustup}/toolchains/$RUSTC_VERSION-x86_64-unknown-linux-gnu/bin/
'';
# Add precompiled library to rustc search path
# (the list is currently empty, so no extra -L flags are produced)
RUSTFLAGS = (builtins.map (a: ''-L ${a}/lib'') [
# add libraries here (e.g. pkgs.libvmi)
]);
# Add glibc, clang, glib and other headers to bindgen search path
BINDGEN_EXTRA_CLANG_ARGS =
# Includes with normal include path
(builtins.map (a: ''-I"${a}/include"'') [
# add dev libraries here (e.g. pkgs.libvmi.dev)
pkgs.glibc.dev
])
# Includes with special directory paths
++ [
''-I"${pkgs.llvmPackages_latest.libclang.lib}/lib/clang/${pkgs.llvmPackages_latest.libclang.version}/include"''
''-I"${pkgs.glib.dev}/include/glib-2.0"''
''-I${pkgs.glib.out}/lib/glib-2.0/include/''
];
}

17
example.txt Normal file
View file

@ -0,0 +1,17 @@
Q = {q0, q1} // states
E = {a, b} // alphabet
T = {z0, A, B} // stack symbols
// nondeterministically push every possible sequence of A's and B's onto the stack
d(q0, epsilon, z0)={(q0, A z0), (q0, B z0)}
d(q0, epsilon, A)={(q0, A A), (q0, B A)}
d(q0, epsilon, B)={(q0, A B), (q0, B B)}
// transition to q1
d(q0, epsilon, z0)={(q1, z0)}
d(q0, epsilon, A)={(q1, A)}
d(q0, epsilon, B)={(q1, B)}
// consume stack until empty
d(q1, a, A)={(q1, epsilon)}
d(q1, b, B)={(q1, epsilon)}

12
src/dfa.rs Normal file
View file

@ -0,0 +1,12 @@
use crate::*;
/// Table-driven description of an automaton: a start state, one name per
/// state, one symbol-indexed row of successor states, and one accepting flag.
///
/// NOTE(review): nothing in this file constructs or reads the table yet; the
/// three vectors are presumably all indexed by the numeric value of a
/// `State` — confirm once the construction code exists.
pub struct TransitionTable {
// State the automaton presumably starts in (nothing reads it yet).
initial: State,
// Human-readable names of the states.
state_names: Vec<String>,
// One `SymbolMap` of successor states per entry.
transitions: Vec<SymbolMap<State>>,
// Presumably `true` for accepting states — TODO confirm against first use.
final_states: Vec<bool>,
}
/// Runtime state of an automaton; currently holds only a single `State`.
///
/// NOTE(review): presumably the state a running DFA is currently in — no code
/// in this file reads or updates it yet.
pub struct DFA {
state: State,
}

215
src/lexer.rs Normal file
View file

@ -0,0 +1,215 @@
/// A lexical token of the automaton description language.
///
/// Tokens that carry text borrow it from the input string.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)]
pub enum Token<'a> {
    /// `(`
    LPar,
    /// `)`
    RPar,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `~`
    Tilde,
    /// `=`
    Eq,
    /// `,`
    Comma,
    /// `|`
    Or,
    /// `+`
    Plus,
    /// `*`
    Star,
    /// `&`
    And,
    /// `->`
    LSmallArrow,
    /// `=>`
    LBigArrow,
    /// Text of a `//` or `/* */` comment, without the opening delimiter.
    Comment(&'a str),
    /// An identifier.
    Ident(&'a str),
}

impl<'a> std::fmt::Display for Token<'a> {
    /// Writes the token as it is spelled in source.
    ///
    /// Comments print as `<comment>`. Identifiers print as the generic word
    /// `ident`, or as their quoted name when the alternate flag (`{:#}`) is
    /// set, which is what diagnostics use for the token actually found.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = match self {
            // `LPar` is the opening parenthesis and `RPar` the closing one.
            Token::LPar => "(",
            Token::RPar => ")",
            Token::LBrace => "{",
            Token::RBrace => "}",
            Token::LBracket => "[",
            Token::RBracket => "]",
            Token::Tilde => "~",
            Token::Eq => "=",
            Token::Comma => ",",
            Token::Or => "|",
            Token::Plus => "+",
            Token::Star => "*",
            Token::And => "&",
            Token::LSmallArrow => "->",
            Token::LBigArrow => "=>",
            Token::Comment(_) => "<comment>",
            Token::Ident(ident) if f.alternate() => return write!(f, "{ident:?}"),
            Token::Ident(_) => "ident",
        };
        f.write_str(text)
    }
}
/// A `(start, end)` pair of byte offsets into the lexed input.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)]
pub struct Span(pub usize, pub usize);

impl Span {
    /// Builds the span that begins where `self` begins and finishes where
    /// `end` finishes. No ordering check is performed on the two spans.
    pub fn join(&self, end: Span) -> Span {
        let Span(first, _) = *self;
        let Span(_, last) = end;
        Span(first, last)
    }
}

/// A value paired with the span it was read from.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)]
pub struct Spanned<T>(pub T, pub Span);

impl<T> Spanned<T> {
    /// Transforms the wrapped value with `map`, leaving the span untouched.
    pub fn map<R>(self, map: impl Fn(T) -> R) -> Spanned<R> {
        let Spanned(value, span) = self;
        Spanned(map(value), span)
    }
}
/// Tokenizer over a borrowed input string; iterating it yields the tokens
/// together with their spans.
#[derive(Clone, Copy, Debug)]
pub struct Lexer<'a> {
// Text being tokenized.
input: &'a str,
// Byte offset at which the token currently being scanned begins.
start: usize,
// Byte offset of the next character that has not been consumed yet.
position: usize,
}
#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)]
pub enum Error {
InvalidChar(char),
UnclosedMultiLine,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the beginning of `input`.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            start: 0,
            position: 0,
        }
    }

    /// Returns the next character and advances past it, or `None` at the end
    /// of the input.
    fn consume(&mut self) -> Option<char> {
        let remaining = self.input.get(self.position..)?;
        let ch = remaining.chars().next()?;
        self.position += ch.len_utf8();
        Some(ch)
    }

    /// Returns the next character without advancing.
    fn peek(&mut self) -> Option<char> {
        let remaining = self.input.get(self.position..)?;
        remaining.chars().next()
    }

    /// Steps back over the most recently consumed character; does nothing at
    /// the start of the input.
    fn backtrack(&mut self) {
        let last_consumed = self
            .input
            .get(..self.position)
            .and_then(|consumed| consumed.chars().next_back());
        if let Some(previous) = last_consumed {
            self.position -= previous.len_utf8();
        }
    }

    /// Zero-width span located at the very end of the input.
    pub fn eof_span(&self) -> Span {
        let end = self.input.len();
        Span(end, end)
    }
}
impl<'a> std::iter::Iterator for Lexer<'a> {
    type Item = Spanned<Result<Token<'a>, Error>>;

    /// Scans the next token.
    ///
    /// Leading whitespace is skipped and is not part of the returned span.
    /// Returns `None` once only whitespace (or nothing) remains. Malformed
    /// input is reported as an `Err` item and scanning can continue afterwards.
    fn next(&mut self) -> Option<Self::Item> {
        while matches!(self.peek(), Some(c) if c.is_whitespace()) {
            self.consume();
        }
        self.start = self.position;

        let res = match self.consume()? {
            '(' => Ok(Token::LPar),
            // A closing parenthesis must produce `RPar`; the parser relies on
            // it to find the end of a tuple.
            ')' => Ok(Token::RPar),
            '{' => Ok(Token::LBrace),
            '}' => Ok(Token::RBrace),
            '[' => Ok(Token::LBracket),
            ']' => Ok(Token::RBracket),
            '~' => Ok(Token::Tilde),
            '+' => Ok(Token::Plus),
            '*' => Ok(Token::Star),
            '&' => Ok(Token::And),
            ',' => Ok(Token::Comma),
            '|' => Ok(Token::Or),
            // `=` alone or `=>`.
            '=' => match self.peek() {
                Some('>') => {
                    self.consume();
                    Ok(Token::LBigArrow)
                }
                _ => Ok(Token::Eq),
            },
            // `-` is only valid as the start of `->`.
            '-' => match self.peek() {
                Some('>') => {
                    self.consume();
                    Ok(Token::LSmallArrow)
                }
                _ => Err(Error::InvalidChar('-')),
            },
            '/' => match self.consume() {
                // Line comment: runs through the newline (which is included
                // in the comment text) or to the end of the input.
                Some('/') => loop {
                    if let Some('\n') | None = self.consume() {
                        break Ok(Token::Comment(&self.input[self.start + 2..self.position]));
                    }
                },
                // Block comment: the text excludes both delimiters.
                Some('*') => loop {
                    match self.consume() {
                        Some('*') if self.peek() == Some('/') => {
                            self.consume();
                            break Ok(Token::Comment(
                                &self.input[self.start + 2..self.position - 2],
                            ));
                        }
                        Some(_) => {}
                        None => break Err(Error::UnclosedMultiLine),
                    }
                },
                // Not a comment: give the character back so it is lexed on
                // its own by the next call.
                Some(_) => {
                    self.backtrack();
                    Err(Error::InvalidChar('/'))
                }
                None => Err(Error::InvalidChar('/')),
            },
            // Identifier: alphabetic or `_`, then any run of alphanumerics
            // and `_`.
            c if c.is_alphabetic() || c == '_' => loop {
                match self.consume() {
                    Some(c) if c.is_alphanumeric() || c == '_' => {}
                    Some(_) => {
                        self.backtrack();
                        break Ok(Token::Ident(&self.input[self.start..self.position]));
                    }
                    None => break Ok(Token::Ident(&self.input[self.start..self.position])),
                }
            },
            c => Err(Error::InvalidChar(c)),
        };

        let span = Span(self.start, self.position);
        self.start = self.position;
        Some(Spanned(res, span))
    }
}
/// Smoke test: runs the lexer over a handful of edge-case inputs and prints
/// the resulting token streams. It makes no assertions; it only checks that
/// lexing these inputs terminates without panicking.
#[test]
fn tokenizer() {
    let tests = [
        "",
        "/*",
        "/**",
        "/*/",
        "/**/",
        "/",
        "//",
        "()[]{}~=>==>->-+*&|, hello _th012is__ a wondweful",
    ];
    tests.into_iter().for_each(|test| {
        let tokens: Vec<_> = Lexer::new(test).collect();
        println!("'{test}': {:?}", tokens);
    });
}

11
src/lib.rs Normal file
View file

@ -0,0 +1,11 @@
pub mod dfa;
pub mod lexer;
pub mod parser;
/// Fixed-size table holding 256 values of type `T`.
///
/// NOTE(review): presumably one slot per input symbol, but `Symbol` wraps a
/// `u16` while this table has only 256 slots — confirm the intended indexing
/// scheme before adding accessors.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SymbolMap<T>([T; 256]);
/// Opaque identifier of an automaton state, stored as a `u16`.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub struct State(u16);
/// Opaque identifier of an alphabet symbol, stored as a `u16`.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub struct Symbol(u16);

7
src/main.rs Normal file
View file

@ -0,0 +1,7 @@
use automata::{lexer::Lexer, parser::Parser};
/// Parses the bundled `example.txt` and pretty-prints the resulting
/// top-level elements.
fn main() {
    let source = include_str!("../example.txt");
    let mut parser = Parser::new(Lexer::new(source));
    let elements = parser.parse_elements();
    println!("{:#?}", elements);
}

284
src/parser.rs Normal file
View file

@ -0,0 +1,284 @@
use std::iter::Peekable;
use crate::lexer::{Lexer, Span, Spanned, Token};
/// A parenthesised sequence of symbols, e.g. `(q0, a, z0)`.
#[derive(Clone, Debug)]
pub struct Tuple<'a>(pub Vec<Spanned<Symbol<'a>>>);
/// A single symbol as written in the input.
#[derive(Clone, Debug)]
pub enum Symbol<'a> {
/// The empty symbol, written `~`, `epsilon` or `ε`.
Epsilon,
/// Any other identifier, borrowed from the input text. The parser also
/// uses `Ident("<INVALID>")` as a placeholder after a syntax error.
Ident(&'a str),
}
/// Left-hand side of a top-level assignment.
#[derive(Clone, Debug)]
pub enum Dest<'a> {
/// A bare name, as in `Q = ...`.
Ident(&'a str),
/// A name followed by a tuple of arguments, as in `d(q0, a, z0) = ...`.
Function(Spanned<&'a str>, Spanned<Tuple<'a>>),
}
/// A value that can appear on the right-hand side of an assignment or as an
/// element of a list.
#[derive(Clone, Debug)]
pub enum Item<'a> {
/// A single symbol.
Symbol(Symbol<'a>),
/// A parenthesised tuple of symbols.
Tuple(Tuple<'a>),
/// A brace-delimited list of items (lists may nest).
List(List<'a>)
}
/// A brace-delimited sequence of items, e.g. `{q0, q1}`.
#[derive(Clone, Debug)]
pub struct List<'a>(pub Vec<Spanned<Item<'a>>>);
/// A top-level element of an input file.
#[derive(Clone, Debug)]
pub enum TopLevel<'a> {
/// `dest = item`.
Assignment(Spanned<Dest<'a>>, Spanned<Item<'a>>),
/// NOTE(review): never constructed by the parser in this file; presumably a
/// placeholder for a future table syntax.
Table(),
}
pub enum LogKind {
Lexer,
UnexpectedToken,
}
pub enum LogLevel {
Info,
Warning,
Error,
}
pub struct Log {
pub message: String,
pub range: Span,
pub level: LogLevel,
pub kind: LogKind,
}
/// Parser that turns a token stream into top-level elements, collecting
/// diagnostics along the way instead of stopping at the first error.
pub struct Parser<'a> {
// Token source with one item of lookahead.
lexer: Peekable<Lexer<'a>>,
// Diagnostics recorded so far.
log: Vec<Log>,
// Span at the end of the input, used when reporting an unexpected eof.
eof: Span,
}
impl<'a> Parser<'a> {
pub fn new(lexer: Lexer<'a>) -> Self{
Parser { eof: lexer.eof_span(), lexer: lexer.peekable(), log: Vec::new() }
}
fn next_token(&mut self) -> Option<Spanned<Token<'a>>> {
loop {
match self.lexer.next()? {
Spanned(Ok(Token::Comment(_)), _) => {}
Spanned(Ok(ok), r) => return Some(Spanned(ok, r)),
Spanned(Err(err), r) => self.log.push(Log {
message: format!("{err:?}"),
range: r,
level: LogLevel::Error,
kind: LogKind::Lexer,
}),
}
}
}
fn peek_token(&mut self) -> Option<Spanned<Token<'a>>> {
loop {
match *self.lexer.peek()? {
// not a heavy clone but because of range
Spanned(Ok(ok), r) => return Some(Spanned(ok, r)),
Spanned(Err(err), r) => self.log.push(Log {
message: format!("{err:?}"),
range: r,
level: LogLevel::Error,
kind: LogKind::Lexer,
}),
}
}
}
fn expect_token(&mut self, expected: Token<'a>) -> (bool, Span) {
if let Some(Spanned(token, range)) = self.next_token() {
if token != expected {
self.log.push(Log {
message: format!("unexpected token {:#}, expected {:}", token, expected),
range,
level: LogLevel::Error,
kind: LogKind::Lexer,
});
(false, range)
}else{
(true, range)
}
} else {
self.log.push(Log {
message: format!("unexpected eof expected {:#}", expected),
range: self.eof,
level: LogLevel::Error,
kind: LogKind::Lexer,
});
(false, self.eof)
}
}
pub fn parse_symbol(&mut self) -> Spanned<Symbol<'a>> {
match self.next_token() {
Some(Spanned(Token::Tilde, r)) => Spanned(Symbol::Epsilon, r),
Some(Spanned(Token::Ident("epsilon"), r)) => Spanned(Symbol::Epsilon, r),
Some(Spanned(Token::Ident("ε"), r)) => Spanned(Symbol::Epsilon, r),
Some(Spanned(Token::Ident(ident), r)) => Spanned(Symbol::Ident(ident), r),
Some(Spanned(got, r)) => {
self.log.push(Log {
message: format!(
"unexpected token {:#}, expected {:}|{:}",
got,
Token::Tilde,
Token::Ident("")
),
range: self.eof,
level: LogLevel::Error,
kind: LogKind::Lexer,
});
Spanned(Symbol::Ident("<INVALID>"), r)
}
None => {
self.log.push(Log {
message: format!(
"unexpected eof expected {:}|{:}",
Token::Tilde,
Token::Ident("")
),
range: self.eof,
level: LogLevel::Error,
kind: LogKind::Lexer,
});
Spanned(Symbol::Ident("<INVALID>"), self.eof)
}
}
}
pub fn parse_tupple(&mut self) -> Spanned<Tuple<'a>> {
let mut items = Vec::new();
let (matched, start) = self.expect_token(Token::LPar);
if !matched{
return Spanned(Tuple(Vec::new()), start)
}
while !matches!(self.peek_token(), Some(Spanned(Token::RPar, _))) {
items.push(self.parse_symbol());
if matches!(self.peek_token(), Some(Spanned(Token::Comma, _))) {
self.next_token();
}
if self.peek_token().is_none(){
self.log.push(Log {
message: format!(
"unexpected eof expected {:}",
Token::RPar
),
range: self.eof,
level: LogLevel::Error,
kind: LogKind::Lexer,
});
break;
}
}
let (_, end) = self.expect_token(Token::RPar);
Spanned(Tuple(items), start.join(end))
}
pub fn parse_item(&mut self) -> Spanned<Item<'a>>{
match self.peek_token(){
Some(Spanned(Token::Ident(_)|Token::Tilde, _)) => self.parse_symbol().map(Item::Symbol),
Some(Spanned(Token::LPar, _)) => self.parse_tupple().map(Item::Tuple),
Some(Spanned(Token::LBrace, _)) => self.parse_list().map(Item::List),
Some(Spanned(got, r)) => {
self.log.push(Log {
message: format!(
"unexpected token {:#}, expected {:}|{:}|{:}|{:}",
got,
Token::Tilde,
Token::Ident(""),
Token::LPar,
Token::LBrace
),
range: self.eof,
level: LogLevel::Error,
kind: LogKind::Lexer,
});
Spanned(Item::Symbol(Symbol::Ident("<INVALID>")), r)
}
None => {
self.log.push(Log {
message: format!(
"unexpected eof expected {:}|{:}|{:}|{:}",
Token::Tilde,
Token::Ident(""),
Token::LPar,
Token::LBrace
),
range: self.eof,
level: LogLevel::Error,
kind: LogKind::Lexer,
});
Spanned(Item::Symbol(Symbol::Ident("<INVALID>")), self.eof)
}
}
}
pub fn parse_list(&mut self) -> Spanned<List<'a>>{
let mut list = Vec::new();
let (matched, start) = self.expect_token(Token::LBrace);
if !matched{
return Spanned(List(Vec::new()), start)
}
while !matches!(self.peek_token(), Some(Spanned(Token::RBrace, _))) {
list.push(self.parse_item());
if matches!(self.peek_token(), Some(Spanned(Token::Comma, _))) {
self.next_token();
}
if self.peek_token().is_none(){
self.log.push(Log {
message: format!(
"unexpected eof expected {:}",
Token::RBrace
),
range: self.eof,
level: LogLevel::Error,
kind: LogKind::Lexer,
});
break;
}
}
let (_, end) = self.expect_token(Token::RBrace);
Spanned(List(list), start.join(end))
}
pub fn parse_elements(&mut self) -> Vec<Spanned<TopLevel<'a>>> {
let mut result = Vec::new();
loop {
let Some(next) = self.next_token() else { break };
match next {
Spanned(Token::Ident(ident), ident_range) => {
let dest @ Spanned(_, start) = if matches!(self.peek_token(), Some(Spanned(Token::LPar, _))) {
let tuple = self.parse_tupple();
let span = ident_range.join(tuple.1);
Spanned(Dest::Function(Spanned(ident, ident_range), tuple), span)
} else {
Spanned(Dest::Ident(ident), ident_range)
};
self.expect_token(Token::Eq);
let item = self.parse_item();
let span = start.join(item.1);
result.push(Spanned(TopLevel::Assignment(dest, item), span));
}
_ => self.log.push(Log {
message: format!("unexpected token {:#}, expected {:}", next.0, Token::Ident("")),
range: next.1,
level: LogLevel::Error,
kind: LogKind::Lexer,
}),
}
}
result
}
}