Skip to content

Instantly share code, notes, and snippets.

@snasphysicist
Created November 2, 2020 19:36
Show Gist options
  • Save snasphysicist/9d9f3f6d91d4a77b414767e2438fa3b9 to your computer and use it in GitHub Desktop.
Save snasphysicist/9d9f3f6d91d4a77b414767e2438fa3b9 to your computer and use it in GitHub Desktop.
Experiment using Rust Analyzer & rustc to extract dev comments
use ra_syntax::{AstNode, File, SyntaxNodeRef, SyntaxKind};
use regex::Regex;
use rustc_lexer::{tokenize, Token, TokenKind};
/*
* [dependencies]
* ra_syntax = "0.1.0"
* regex = "1.4.1"
* rustc_lexer = "0.1.0"
*/
/// Sample Rust source that `main` runs through both parsers.
///
/// It holds one comment of each flavour: a block comment, a plain line comment, an outer
/// documentation comment (`///`) and an inner documentation comment (`//!`). Every `\n\` pair
/// below ends one line of the sample; the trailing backslash continues the literal so the
/// physical line break in this file is not added to the string.
const RUST_SOURCE: &str =
"/* block content */ \n\
// non-documentation line content \n\
/// outer documentation line content \n\
fn main() {\n\
//! inner documentation line content \n\
}";
/// A classified piece of source text.
///
/// The two kinds of non-documentation comment, plus a catch-all for everything else, are
/// gathered under one enum so that they can live side by side in a single vector and be
/// matched on uniformly.
#[derive(Debug)]
enum TokenType {
    /// A non-documentation block comment.
    BlockComment(BlockComment),
    /// A non-documentation line comment.
    LineComment(LineComment),
    /// Any other piece of source text.
    Other(Other),
}
impl TokenType {
    /// Classifies a piece of text for the `rustc` path.
    ///
    /// `content` is the text and `location` is where that text starts. The wrapped types'
    /// `from_content` constructors are tried in order: block comment first, then line comment.
    /// Each later step only runs when the earlier one returned `None`; text that is neither
    /// becomes `TokenType::Other`.
    fn from_content(content: &str, location: usize) -> TokenType {
        BlockComment::from_content(content, location)
            .map(TokenType::BlockComment)
            .or_else(|| LineComment::from_content(content, location).map(TokenType::LineComment))
            .unwrap_or_else(|| TokenType::Other(Other::from_content(content, location)))
    }

    /// Classifies a syntax node for the `rust_analyzer` path.
    ///
    /// Mirrors `from_content`: the wrapped types' `from_node` constructors are tried in the
    /// order block comment, line comment, and finally the never-failing `Other`.
    fn from_node(node: SyntaxNodeRef) -> TokenType {
        BlockComment::from_node(node)
            .map(TokenType::BlockComment)
            .or_else(|| LineComment::from_node(node).map(TokenType::LineComment))
            .unwrap_or_else(|| TokenType::Other(Other::from_node(node)))
    }
}
/// Something that knows how to print itself back out as source code.
///
/// This is probably not strictly needed any more, but it is kept around because it could still
/// be useful: every kind of thing we are interested in can render itself this way.
trait TextToken {
    /// Returns this item rendered as source text.
    fn as_source(&self) -> String;
}
/// A developer (non-documentation) block comment.
///
/// `start` records where the comment begins, so ordering information is not lost should we
/// want to print the source file back out. `content` is the text of the comment only; the
/// comment markers (`/*` and `*/`) are not part of it.
#[derive(Debug)]
struct BlockComment {
    start: usize,
    content: String,
}
impl BlockComment {
    /// For use with `rustc`: converts a piece of text and the location at which it starts into
    /// a `BlockComment`, if the text is a non-documentation block comment.
    ///
    /// Returns `None` when `content`
    /// * does not begin with `/*` or does not end with `*/` (this includes an unterminated
    ///   comment and the too-short `/*/`), or
    /// * is a documentation comment: `/*! ... */`, or `/** ... */` where the character after
    ///   `/**` is neither `*` nor `/`. `/**/` and `/***/` are therefore ordinary comments.
    ///
    /// The stored content is everything between the markers and may span several lines.
    fn from_content(content: &str, location: usize) -> Option<BlockComment> {
        let after_open = content.strip_prefix("/*")?;

        // Same test the Rust lexer applies to the two characters following `/*`.
        let mut leading = after_open.chars();
        let is_doc = match (leading.next(), leading.next()) {
            (Some('!'), _) => true,
            (Some('*'), Some(next)) => next != '*' && next != '/',
            _ => false,
        };
        if is_doc {
            return None;
        }

        // Stripping the suffix from what follows the opener (rather than from the whole text)
        // keeps `/*/` from being accepted with overlapping markers.
        let body = after_open.strip_suffix("*/")?;
        Some(BlockComment { start: location, content: body.to_string() })
    }

    /// For use with `rust_analyzer`: converts a syntax node into a `BlockComment` if possible.
    ///
    /// Only nodes of kind `COMMENT` are considered; their text and start offset are handed to
    /// `from_content`, which decides whether the comment is a non-documentation block comment.
    fn from_node(node: SyntaxNodeRef) -> Option<BlockComment> {
        match node.kind() {
            SyntaxKind::COMMENT => {
                let text = node.text().to_string();
                BlockComment::from_content(&text, node.range().start().to_usize())
            }
            _ => None,
        }
    }
}
impl TextToken for BlockComment {
    /// Renders the block comment as it would appear in the source file, i.e. the stored
    /// content wrapped in the `/*` and `*/` markers again.
    fn as_source(&self) -> String {
        ["/*", self.content.as_str(), "*/"].concat()
    }
}
/// A developer (non-documentation) line comment.
///
/// `start` records where the comment begins, so ordering information is not lost should we
/// want to print the source file back out. `content` is the text of the comment only; the
/// comment marker (`//`) is not part of it.
#[derive(Debug)]
struct LineComment {
    start: usize,
    content: String,
}
impl LineComment {
    /// For use with `rustc`: converts a piece of text and the location at which it starts into
    /// a `LineComment`, if the text is a non-documentation line comment.
    ///
    /// Returns `None` when `content`
    /// * does not begin with `//`,
    /// * contains a line break (a line comment cannot span lines), or
    /// * is a documentation comment: `//! ...`, or `/// ...` with exactly three slashes.
    ///
    /// A bare `//` is accepted with empty content, and four or more slashes (`//// ...`) form
    /// an ordinary comment whose content starts with the extra slashes.
    fn from_content(content: &str, location: usize) -> Option<LineComment> {
        let rest = content.strip_prefix("//")?;
        if rest.contains('\n') {
            return None;
        }

        // Same test the Rust lexer applies to the two characters following `//`.
        let mut leading = rest.chars();
        let is_doc = match (leading.next(), leading.next()) {
            (Some('!'), _) => true,
            (Some('/'), next) => next != Some('/'),
            _ => false,
        };
        if is_doc {
            return None;
        }

        Some(LineComment { start: location, content: rest.to_string() })
    }

    /// For use with `rust_analyzer`: converts a syntax node into a `LineComment` if possible.
    ///
    /// Only nodes of kind `COMMENT` are considered; their text and start offset are handed to
    /// `from_content`, which decides whether the comment is a non-documentation line comment.
    fn from_node(node: SyntaxNodeRef) -> Option<LineComment> {
        match node.kind() {
            SyntaxKind::COMMENT => {
                let text = node.text().to_string();
                LineComment::from_content(&text, node.range().start().to_usize())
            }
            _ => None,
        }
    }
}
impl TextToken for LineComment {
    /// Renders the line comment as it would appear in the source file, i.e. the stored
    /// content with the `//` marker put back in front.
    fn as_source(&self) -> String {
        ["//", self.content.as_str()].concat()
    }
}
/// Any part of the source that is neither a developer block comment nor a developer line
/// comment.
///
/// The goal is only to spellcheck those two comment kinds, so no further detail is kept here:
/// `content` is simply the full text of this part of the source and `start` is where it begins.
#[derive(Debug)]
struct Other {
    start: usize,
    content: String,
}
impl Other {
    /// For use with `rustc`: wraps a piece of text and the location at which it starts in an
    /// `Other`. This cannot fail, so it should be tried only after `BlockComment` and
    /// `LineComment` have declined the text.
    fn from_content(content: &str, location: usize) -> Other {
        let content = String::from(content);
        Other { start: location, content }
    }

    /// For use with `rust_analyzer`: wraps a syntax node's start offset and text in an `Other`.
    /// This cannot fail, so it should be tried only after `BlockComment` and `LineComment`
    /// have declined the node.
    fn from_node(node: SyntaxNodeRef) -> Other {
        let start = node.range().start().to_usize();
        let content = node.text().to_string();
        Other { start, content }
    }
}
/// For anything we are not spell-checking, the stored content already is the source text, so
/// it is handed back unchanged with nothing added around it.
impl TextToken for Other {
    /// Returns a copy of the stored content.
    fn as_source(&self) -> String {
        // A plain clone yields the same string as `format!("{}", ..)` without going through
        // the formatting machinery.
        self.content.clone()
    }
}
/// Intermediate representation on the `rustc` path that ties location information to the text
/// of a token.
///
/// `start..end` is the byte range the token occupies in the source and `content` is the text
/// found there. Not every field is necessarily needed.
struct TokenWithLocation {
    kind: TokenKind,
    start: usize,
    end: usize,
    content: String,
}
/// Runs the sample source through both parsers in turn - `rustc` first, then `rust_analyzer` -
/// printing a heading followed by the resulting tokens for each.
fn main() {
    println!("\nRUSTC\n");
    print_tokens(&parse_with_rustc(RUST_SOURCE));

    println!("\nRUST_ANALYZER\n");
    print_tokens(&parse_with_rust_analyzer(RUST_SOURCE));
}
/// Prints a series of `TokenType`s, one per line, using their `Debug` representation.
///
/// Takes a slice rather than `&Vec`, so any contiguous run of tokens can be printed; a
/// `&Vec<TokenType>` argument still works because it coerces to a slice.
fn print_tokens(tokens: &[TokenType]) {
    for token in tokens {
        println!("{:?}", token);
    }
}
/// Parses the provided source into `TokenType`s using `rustc`.
///
/// The source is first tokenized with location information; each token's text and start
/// offset are then classified by `TokenType::from_content`, keeping the tokens in order.
fn parse_with_rustc(source: &str) -> Vec<TokenType> {
    let mut classified = Vec::new();
    for token in tokenize_with_location(source) {
        classified.push(TokenType::from_content(&token.content, token.start));
    }
    classified
}
/// Tokenizes the provided source using `rustc` and pairs every token with its location and
/// text.
///
/// The tokenizer reports only a kind and a length per token, so a running offset is kept:
/// each token starts where the previous one ended, and its text is the slice of `source`
/// covering `start..end`.
fn tokenize_with_location(source: &str) -> Vec<TokenWithLocation> {
    let mut offset = 0;
    tokenize(source)
        .into_iter()
        .map(|token| {
            let start = offset;
            let end = start + token.len;
            // The next token begins where this one stops.
            offset = end;
            TokenWithLocation {
                kind: token.kind,
                start,
                end,
                content: source[start..end].to_string(),
            }
        })
        .collect()
}
/// Parses the given source with `rust_analyzer` and converts it into `TokenType`s.
///
/// Walks every descendant of the parsed file's syntax tree, keeps only the leaf nodes, and
/// classifies each of them with `TokenType::from_node`, in traversal order.
fn parse_with_rust_analyzer(source: &str) -> Vec<TokenType> {
    let parsed = File::parse(source);
    let mut classified = Vec::new();
    for node in parsed.ast().syntax().descendants() {
        if node.is_leaf() {
            classified.push(TokenType::from_node(node));
        }
    }
    classified
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment