singularity-forge/rust-engine/crates/ast/src/language/mod.rs

511 lines
15 KiB
Rust

//! Vendored and extended language definitions for ast-grep integration.
//!
//! Originally derived from `ast-grep-language` v0.39.9, stripped of
//! serde/ignore machinery, and extended with additional languages.
mod parsers;
use std::{borrow::Cow, collections::HashMap, fmt, path::Path};
use ast_grep_core::{
Doc, Language, Node,
matcher::{KindMatcher, Pattern, PatternBuilder, PatternError},
meta_var::MetaVariable,
tree_sitter::{LanguageExt, StrDoc, TSLanguage, TSRange},
};
/// Implements a stub language (no expando / `pre_process_pattern` needed).
/// Use when the language grammar accepts `$VAR` as valid identifiers.
///
/// `impl_lang!(Name, parser_fn)` declares a public unit struct `Name` and
/// implements [`Language`] and [`LanguageExt`] for it, with
/// `parsers::parser_fn()` supplying the tree-sitter grammar. `expando_char`
/// and `pre_process_pattern` are not overridden here, so whatever the
/// [`Language`] trait provides by default is used.
macro_rules! impl_lang {
($lang:ident, $func:ident) => {
#[derive(Clone, Copy, Debug)]
pub struct $lang;
impl Language for $lang {
// Resolves a node-kind name to its numeric id via the grammar.
// NOTE(review): the `true` argument is presumably tree-sitter's
// "named node" flag — confirm against the tree-sitter API.
fn kind_to_id(&self, kind: &str) -> u16 {
self.get_ts_language().id_for_node_kind(kind, true)
}
// Resolves a field name to its numeric id; `None` when the grammar has
// no such field. `.get()` unwraps the id returned by the grammar into a
// plain `u16`.
fn field_to_id(&self, field: &str) -> Option<u16> {
self
.get_ts_language()
.field_id_for_name(field)
.map(|f| f.get())
}
// Builds a pattern by parsing the pattern source into a `StrDoc` that
// uses this language.
fn build_pattern(&self, builder: &PatternBuilder) -> Result<Pattern, PatternError> {
builder.build(|src| StrDoc::try_new(src, *self))
}
}
impl LanguageExt for $lang {
// `.into()` converts the value returned by the `parsers` constructor into
// a `TSLanguage`.
fn get_ts_language(&self) -> TSLanguage {
parsers::$func().into()
}
}
};
}
/// Rewrites `$`-prefixed meta-variables in `query` so that they use `expando`
/// instead of `$`, for grammars that cannot parse `$` inside an identifier.
///
/// A run of `$` characters is replaced, one-for-one, by `expando` when either
///
/// * the character following the run is `A..=Z` or `_` (e.g. `$A`, `$$A`,
///   `$$$ARGS`, `$_`), or
/// * the run is exactly three `$` long (the anonymous multi-match `$$$`),
///   whether it is followed by another character or ends the query.
///
/// Every other `$` run is copied through unchanged.
///
/// Returns `Cow::Borrowed(query)` when `query` contains no `$` at all (nothing
/// can change), and an owned, rewritten string otherwise.
fn pre_process_pattern(expando: char, query: &str) -> Cow<'_, str> {
    // Fast path: without any `$` the output is byte-identical to the input,
    // so avoid allocating.
    if !query.contains('$') {
        return Cow::Borrowed(query);
    }

    let mut rewritten = String::with_capacity(query.len());
    // Number of consecutive `$` seen but not yet written out.
    let mut pending_dollars = 0usize;

    for c in query.chars() {
        if c == '$' {
            pending_dollars += 1;
            continue;
        }
        // The sigil for the pending run can only be decided once the character
        // that follows it is known.
        let is_meta_var = matches!(c, 'A'..='Z' | '_') || pending_dollars == 3;
        let sigil = if is_meta_var { expando } else { '$' };
        rewritten.extend(std::iter::repeat_n(sigil, pending_dollars));
        pending_dollars = 0;
        rewritten.push(c);
    }

    // Flush a run that ends the query; only a bare `$$$` is rewritten here.
    let sigil = if pending_dollars == 3 { expando } else { '$' };
    rewritten.extend(std::iter::repeat_n(sigil, pending_dollars));

    Cow::Owned(rewritten)
}
/// Implements a language with `expando_char` / `pre_process_pattern`.
/// Use when the language does NOT accept `$` as a valid identifier character.
///
/// `impl_lang_expando!(Name, parser_fn, ch)` declares a public unit struct
/// `Name` and implements [`Language`] and [`LanguageExt`] for it, with
/// `parsers::parser_fn()` supplying the tree-sitter grammar. `ch` is returned
/// from `expando_char` and is what `pre_process_pattern` writes in place of
/// `$` when rewriting meta-variables in a pattern.
macro_rules! impl_lang_expando {
($lang:ident, $func:ident, $char:expr) => {
#[derive(Clone, Copy, Debug)]
pub struct $lang;
impl Language for $lang {
// Resolves a node-kind name to its numeric id via the grammar.
// NOTE(review): the `true` argument is presumably tree-sitter's
// "named node" flag — confirm against the tree-sitter API.
fn kind_to_id(&self, kind: &str) -> u16 {
self.get_ts_language().id_for_node_kind(kind, true)
}
// Resolves a field name to its numeric id; `None` when the grammar has
// no such field.
fn field_to_id(&self, field: &str) -> Option<u16> {
self
.get_ts_language()
.field_id_for_name(field)
.map(|f| f.get())
}
// The substitute character supplied as the macro's third argument.
fn expando_char(&self) -> char {
$char
}
// Delegates to the module-level `pre_process_pattern` helper (the bare
// call resolves to the free function, not to this method).
fn pre_process_pattern<'q>(&self, query: &'q str) -> Cow<'q, str> {
pre_process_pattern(self.expando_char(), query)
}
// Builds a pattern by parsing the pattern source into a `StrDoc` that
// uses this language.
fn build_pattern(&self, builder: &PatternBuilder) -> Result<Pattern, PatternError> {
builder.build(|src| StrDoc::try_new(src, *self))
}
}
impl LanguageExt for $lang {
// `.into()` converts the value returned by the `parsers` constructor into
// a `TSLanguage`.
fn get_ts_language(&self) -> TSLanguage {
parsers::$func().into()
}
}
};
}
// ── Customized languages with expando_char ──────────────────────────────
// Arguments: type name, `parsers` constructor, and the expando character that
// `pre_process_pattern` writes in place of `$` for meta-variables.
impl_lang_expando!(C, language_c, '𐀀');
impl_lang_expando!(Cpp, language_cpp, '𐀀');
impl_lang_expando!(CSharp, language_c_sharp, 'µ');
impl_lang_expando!(Css, language_css, '_');
impl_lang_expando!(Elixir, language_elixir, 'µ');
impl_lang_expando!(Go, language_go, 'µ');
impl_lang_expando!(Haskell, language_haskell, 'µ');
impl_lang_expando!(Hcl, language_hcl, 'µ');
impl_lang_expando!(Kotlin, language_kotlin, 'µ');
impl_lang_expando!(Nix, language_nix, '_');
impl_lang_expando!(Php, language_php, 'µ');
impl_lang_expando!(Python, language_python, 'µ');
impl_lang_expando!(Ruby, language_ruby, 'µ');
impl_lang_expando!(Rust, language_rust, 'µ');
impl_lang_expando!(Swift, language_swift, 'µ');
// New expando languages
impl_lang_expando!(Make, language_make, 'µ');
impl_lang_expando!(ObjC, language_objc, '𐀀');
impl_lang_expando!(Starlark, language_starlark, 'µ');
impl_lang_expando!(Odin, language_odin, 'µ');
impl_lang_expando!(Julia, language_julia, 'µ');
impl_lang_expando!(Verilog, language_verilog, 'µ');
impl_lang_expando!(Zig, language_zig, 'µ');
// ── Stub languages ($ accepted in grammar) ──────────────────────────────
// Arguments: type name and `parsers` constructor; no expando character is
// configured for these.
impl_lang!(Bash, language_bash);
impl_lang!(Java, language_java);
impl_lang!(JavaScript, language_javascript);
impl_lang!(Json, language_json);
impl_lang!(Lua, language_lua);
impl_lang!(Scala, language_scala);
impl_lang!(Solidity, language_solidity);
impl_lang!(Tsx, language_tsx);
impl_lang!(TypeScript, language_typescript);
impl_lang!(Yaml, language_yaml);
// New stub languages
impl_lang!(Markdown, language_markdown);
impl_lang!(Toml, language_toml);
impl_lang!(Diff, language_diff);
impl_lang!(Xml, language_xml);
impl_lang!(Regex, language_regex);
// ── Html (custom implementation with injection support) ──────────────────
/// HTML language definition.
///
/// Spelled out by hand instead of going through `impl_lang_expando!` because
/// its `LanguageExt` impl additionally reports embedded-language regions.
#[derive(Clone, Copy, Debug)]
pub struct Html;

impl Language for Html {
    /// Looks up the numeric id of the node kind `kind` in the HTML grammar.
    fn kind_to_id(&self, kind: &str) -> u16 {
        let grammar = self.get_ts_language();
        grammar.id_for_node_kind(kind, true)
    }

    /// Looks up the numeric id of `field`, or `None` if the grammar has no
    /// field with that name.
    fn field_to_id(&self, field: &str) -> Option<u16> {
        let id = self.get_ts_language().field_id_for_name(field)?;
        Some(id.get())
    }

    /// The character written in place of `$` in meta-variables.
    fn expando_char(&self) -> char {
        'z'
    }

    /// Rewrites `$` meta-variables in `query` using [`Self::expando_char`].
    fn pre_process_pattern<'q>(&self, query: &'q str) -> Cow<'q, str> {
        let expando = self.expando_char();
        // Resolves to the module-level helper, not to this method.
        pre_process_pattern(expando, query)
    }

    /// Builds a pattern by parsing its source as an HTML `StrDoc`.
    fn build_pattern(&self, builder: &PatternBuilder) -> Result<Pattern, PatternError> {
        builder.build(|pattern_src| StrDoc::try_new(pattern_src, *self))
    }
}
impl LanguageExt for Html {
    fn get_ts_language(&self) -> TSLanguage {
        parsers::language_html()
    }

    /// Names of the languages that may be reported by `extract_injections`.
    fn injectable_languages(&self) -> Option<&'static [&'static str]> {
        Some(&["css", "js", "ts", "tsx", "scss", "less", "stylus", "coffee"])
    }

    /// Collects the `raw_text` ranges of every `<script>` and `<style>`
    /// element under `root`, grouped by embedded language.
    ///
    /// The language is taken from the element's `lang` attribute when present;
    /// otherwise scripts default to `"js"` and styles to `"css"`. Elements
    /// without a `raw_text` child contribute nothing.
    fn extract_injections<L: LanguageExt>(
        &self,
        root: Node<StrDoc<L>>,
    ) -> HashMap<String, Vec<TSRange>> {
        // (node kind of the embedding element, language used when no `lang`
        // attribute is given). Scripts are visited before styles.
        const EMBEDDING_ELEMENTS: [(&str, &str); 2] =
            [("script_element", "js"), ("style_element", "css")];

        let lang = root.lang();
        let mut injections: HashMap<String, Vec<TSRange>> = HashMap::new();

        for (element_kind, fallback) in EMBEDDING_ELEMENTS {
            for element in root.find_all(KindMatcher::new(element_kind, lang.clone())) {
                let Some(raw) = element.children().find(|child| child.kind() == "raw_text")
                else {
                    continue;
                };
                let injected = find_html_lang(&element).unwrap_or_else(|| fallback.into());
                injections
                    .entry(injected)
                    .or_default()
                    .push(node_to_range(&raw));
            }
        }

        injections
    }
}
/// Returns the value of the first `lang` attribute found under `node` that
/// also has an `attribute_value`, or `None` if there is no such attribute.
///
/// The attribute name is compared to `"lang"` exactly (case-sensitively).
fn find_html_lang<D: Doc>(node: &Node<D>) -> Option<String> {
    let html = node.lang();
    let attributes = KindMatcher::new("attribute", html.clone());
    let attr_name = KindMatcher::new("attribute_name", html.clone());
    let attr_value = KindMatcher::new("attribute_value", html.clone());

    for attr in node.find_all(attributes) {
        let Some(name) = attr.find(&attr_name) else {
            continue;
        };
        if name.text() != "lang" {
            continue;
        }
        // A `lang` attribute without a value is skipped; later attributes are
        // still considered.
        if let Some(value) = attr.find(&attr_value) {
            return Some(value.text().to_string());
        }
    }
    None
}
/// Converts `node`'s extent into a `TSRange`: byte offsets come from
/// `node.range()`, and the start/end points from the `byte_point()` pairs of
/// `node.start_pos()` and `node.end_pos()`.
fn node_to_range<D: Doc>(node: &Node<D>) -> TSRange {
    let bytes = node.range();
    let start = node.start_pos().byte_point();
    let end = node.end_pos().byte_point();
    TSRange {
        start_byte: bytes.start,
        end_byte: bytes.end,
        start_point: tree_sitter::Point::new(start.0, start.1),
        end_point: tree_sitter::Point::new(end.0, end.1),
    }
}
// ── SupportLang enum ────────────────────────────────────────────────────
/// All supported languages for ast-grep structural search/replace.
///
/// Each variant corresponds to the unit struct of the same name defined in
/// this module; trait methods on `SupportLang` forward to that struct.
/// Variants are listed alphabetically, and `all_langs()` returns them in this
/// same order.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum SupportLang {
Bash,
C,
Cpp,
CSharp,
Css,
Diff,
Elixir,
Go,
Haskell,
Hcl,
Html,
Java,
JavaScript,
Json,
Julia,
Kotlin,
Lua,
Make,
Markdown,
Nix,
ObjC,
Odin,
Php,
Python,
Regex,
Ruby,
Rust,
Scala,
Solidity,
Starlark,
Swift,
Toml,
Tsx,
TypeScript,
Verilog,
Xml,
Yaml,
Zig,
}
impl SupportLang {
pub const fn all_langs() -> &'static [Self] {
use SupportLang::*;
&[
Bash, C, Cpp, CSharp, Css, Diff, Elixir, Go, Haskell, Hcl, Html, Java, JavaScript, Json,
Julia, Kotlin, Lua, Make, Markdown, Nix, ObjC, Odin, Php, Python, Regex, Ruby, Rust,
Scala, Solidity, Starlark, Swift, Toml, Tsx, TypeScript, Verilog, Xml, Yaml, Zig,
]
}
/// The canonical lowercase name used as a stable key in alias maps,
/// file-type inference results, and error messages.
pub const fn canonical_name(self) -> &'static str {
match self {
Self::Bash => "bash",
Self::C => "c",
Self::Cpp => "cpp",
Self::CSharp => "csharp",
Self::Css => "css",
Self::Diff => "diff",
Self::Elixir => "elixir",
Self::Go => "go",
Self::Haskell => "haskell",
Self::Hcl => "hcl",
Self::Html => "html",
Self::Java => "java",
Self::JavaScript => "javascript",
Self::Json => "json",
Self::Julia => "julia",
Self::Kotlin => "kotlin",
Self::Lua => "lua",
Self::Make => "make",
Self::Markdown => "markdown",
Self::Nix => "nix",
Self::ObjC => "objc",
Self::Odin => "odin",
Self::Php => "php",
Self::Python => "python",
Self::Regex => "regex",
Self::Ruby => "ruby",
Self::Rust => "rust",
Self::Scala => "scala",
Self::Solidity => "solidity",
Self::Starlark => "starlark",
Self::Swift => "swift",
Self::Toml => "toml",
Self::Tsx => "tsx",
Self::TypeScript => "typescript",
Self::Verilog => "verilog",
Self::Xml => "xml",
Self::Yaml => "yaml",
Self::Zig => "zig",
}
}
}
// `Display` prints the variant name exactly as the derived `Debug` does
// (e.g. `JavaScript`), not the lowercase `canonical_name()`.
impl fmt::Display for SupportLang {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!("{self:?}"))
    }
}
// ── Dispatch macro ──────────────────────────────────────────────────────
// `execute_lang_method! { me, method, args... }` expands to a `match` on the
// `SupportLang` value `me` that calls `method(args...)` on the unit struct
// named like the matched variant (e.g. `S::Rust => Rust.method(args...)`).
//
// The match has no wildcard arm, so a variant added to `SupportLang` without a
// corresponding arm here is a compile error rather than a silent fallthrough.
macro_rules! execute_lang_method {
($me:path, $method:ident, $($pname:tt),*) => {
// Short local alias so the arms below stay compact.
use SupportLang as S;
match $me {
S::Bash => Bash.$method($($pname,)*),
S::C => C.$method($($pname,)*),
S::Cpp => Cpp.$method($($pname,)*),
S::CSharp => CSharp.$method($($pname,)*),
S::Css => Css.$method($($pname,)*),
S::Diff => Diff.$method($($pname,)*),
S::Elixir => Elixir.$method($($pname,)*),
S::Go => Go.$method($($pname,)*),
S::Haskell => Haskell.$method($($pname,)*),
S::Hcl => Hcl.$method($($pname,)*),
S::Html => Html.$method($($pname,)*),
S::Java => Java.$method($($pname,)*),
S::JavaScript => JavaScript.$method($($pname,)*),
S::Json => Json.$method($($pname,)*),
S::Julia => Julia.$method($($pname,)*),
S::Kotlin => Kotlin.$method($($pname,)*),
S::Lua => Lua.$method($($pname,)*),
S::Make => Make.$method($($pname,)*),
S::Markdown => Markdown.$method($($pname,)*),
S::Nix => Nix.$method($($pname,)*),
S::ObjC => ObjC.$method($($pname,)*),
S::Odin => Odin.$method($($pname,)*),
S::Php => Php.$method($($pname,)*),
S::Python => Python.$method($($pname,)*),
S::Regex => Regex.$method($($pname,)*),
S::Ruby => Ruby.$method($($pname,)*),
S::Rust => Rust.$method($($pname,)*),
S::Scala => Scala.$method($($pname,)*),
S::Solidity => Solidity.$method($($pname,)*),
S::Starlark => Starlark.$method($($pname,)*),
S::Swift => Swift.$method($($pname,)*),
S::Toml => Toml.$method($($pname,)*),
S::Tsx => Tsx.$method($($pname,)*),
S::TypeScript => TypeScript.$method($($pname,)*),
S::Verilog => Verilog.$method($($pname,)*),
S::Xml => Xml.$method($($pname,)*),
S::Yaml => Yaml.$method($($pname,)*),
S::Zig => Zig.$method($($pname,)*),
}
};
}
// `impl_lang_method!(name, (p1: T1, ...) => Ret)` generates an `#[inline]`
// method `fn name(&self, p1: T1, ...) -> Ret` whose body forwards to the
// per-language struct through `execute_lang_method!`.
//
// The pattern has no slot for generic or lifetime parameters, so methods that
// need them must be written out by hand.
macro_rules! impl_lang_method {
($method:ident, ($($pname:tt: $ptype:ty),*) => $return_type:ty) => {
#[inline]
fn $method(&self, $($pname: $ptype),*) -> $return_type {
execute_lang_method! { self, $method, $($pname),* }
}
};
}
// `Language` for `SupportLang` forwards every method to the unit struct that
// matches the current variant.
impl Language for SupportLang {
impl_lang_method!(kind_to_id, (kind: &str) => u16);
impl_lang_method!(field_to_id, (field: &str) => Option<u16>);
impl_lang_method!(meta_var_char, () => char);
impl_lang_method!(expando_char, () => char);
impl_lang_method!(extract_meta_var, (source: &str) => Option<MetaVariable>);
impl_lang_method!(build_pattern, (builder: &PatternBuilder) => Result<Pattern, PatternError>);
// Written out by hand: `impl_lang_method!` cannot express the `'q` lifetime
// parameter this signature needs.
fn pre_process_pattern<'q>(&self, query: &'q str) -> Cow<'q, str> {
execute_lang_method! { self, pre_process_pattern, query }
}
// Infers the language from `path` via `from_extension`; `None` when the path
// is not recognised.
fn from_path<P: AsRef<Path>>(path: P) -> Option<Self> {
from_extension(path.as_ref())
}
}
impl LanguageExt for SupportLang {
    impl_lang_method!(get_ts_language, () => TSLanguage);
    impl_lang_method!(injectable_languages, () => Option<&'static [&'static str]>);

    /// Returns the embedded-language ranges under `root`.
    ///
    /// Only `Html` reports injections; every other language yields an empty
    /// map.
    fn extract_injections<L: LanguageExt>(
        &self,
        root: Node<StrDoc<L>>,
    ) -> HashMap<String, Vec<TSRange>> {
        if matches!(self, Self::Html) {
            Html.extract_injections(root)
        } else {
            HashMap::new()
        }
    }
}
// ── File extension mapping ──────────────────────────────────────────────
/// Returns the file extensions (written without a leading dot) associated with
/// `lang`. The slice is empty for languages that are never inferred from a
/// file name.
///
/// NOTE(review): `"bzl"` is listed under both `Python` and `Starlark`.
/// `from_extension` takes the first match while scanning
/// `SupportLang::all_langs()`, where `Python` precedes `Starlark`, so `.bzl`
/// files currently resolve to `Python` and the `Starlark` entry is never
/// reached — confirm which mapping is intended.
const fn extensions(lang: SupportLang) -> &'static [&'static str] {
use SupportLang::*;
match lang {
Bash => {
&["bash", "bats", "cgi", "command", "env", "fcgi", "ksh", "sh", "tmux", "tool", "zsh"]
},
C => &["c", "h"],
Cpp => &["cc", "hpp", "cpp", "c++", "hh", "cxx", "cu", "ino"],
CSharp => &["cs"],
Css => &["css", "scss"],
Diff => &["diff", "patch"],
Elixir => &["ex", "exs"],
Go => &["go"],
Haskell => &["hs"],
Hcl => &["hcl", "tf", "tfvars"],
Html => &["html", "htm", "xhtml"],
Java => &["java"],
JavaScript => &["cjs", "js", "mjs", "jsx"],
Json => &["json"],
Julia => &["jl"],
Kotlin => &["kt", "ktm", "kts"],
Lua => &["lua"],
Make => &["mk", "mak"],
Markdown => &["md", "markdown", "mdx"],
Nix => &["nix"],
ObjC => &["m"],
Odin => &["odin"],
Php => &["php"],
Python => &["py", "py3", "pyi", "bzl"],
Regex => &[], // regex has no file extension
Ruby => &["rb", "rbw", "gemspec"],
Rust => &["rs"],
Scala => &["scala", "sc", "sbt"],
Solidity => &["sol"],
Starlark => &["star", "bzl"],
Swift => &["swift"],
Toml => &["toml"],
Tsx => &["tsx"],
TypeScript => &["ts", "cts", "mts"],
Verilog => &["v", "sv", "svh", "vh"],
Xml => &["xml", "xsl", "xslt", "svg", "plist"],
Yaml => &["yaml", "yml"],
Zig => &["zig"],
}
}
/// Guess language from file extension.
///
/// * If `path` has an extension, the first language in
///   [`SupportLang::all_langs`] whose `extensions` list contains it is
///   returned. The comparison is exact (case-sensitive).
/// * If `path` has no extension, the file name is checked against the
///   well-known Makefile names (`Makefile`, `makefile`, `GNUmakefile`).
///
/// Returns `None` when nothing matches, or when the relevant path component is
/// not valid UTF-8.
fn from_extension(path: &Path) -> Option<SupportLang> {
    // `Path::extension` yields `None` for names without a dot, so extension-less
    // files have to be handled before any `?` on the extension; otherwise the
    // Makefile names below could never be reached.
    let Some(ext) = path.extension() else {
        return match path.file_name()?.to_str()? {
            "Makefile" | "makefile" | "GNUmakefile" => Some(SupportLang::Make),
            _ => None,
        };
    };

    let ext = ext.to_str()?;
    SupportLang::all_langs()
        .iter()
        .copied()
        .find(|&lang| extensions(lang).contains(&ext))
}