Created
November 2, 2020 19:36
-
-
Save snasphysicist/9d9f3f6d91d4a77b414767e2438fa3b9 to your computer and use it in GitHub Desktop.
Experiment using Rust Analyzer & rustc to extract dev comments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use ra_syntax::{AstNode, File, SyntaxNodeRef, SyntaxKind}; | |
use regex::Regex; | |
use rustc_lexer::{tokenize, Token, TokenKind}; | |
/* | |
* [dependencies] | |
* ra_syntax = "0.1.0" | |
* regex = "1.4.1" | |
* rustc_lexer = "0.1.0" | |
*/ | |
/// Hard-coded sample source fed to both tokenizers below. It exercises every
/// comment flavour we care about: a block comment, a plain (non-doc) line
/// comment, an outer doc comment (`///`) and an inner doc comment (`//!`).
const RUST_SOURCE: &str =
"/* block content */ \n\
// non-documentation line content \n\
/// outer documentation line content \n\
fn main() {\n\
//! inner documentation line content \n\
}";
/// We wrap the three types of object we're interested in - two types of non-documentation comment
/// and everything else - in this enum to make storing them in a single vector & matching against
/// them easier
#[derive(Debug)]
enum TokenType {
    /// A developer `/* ... */` comment (not documentation).
    BlockComment(BlockComment),
    /// A developer `//` comment (not documentation).
    LineComment(LineComment),
    /// Anything else: code, whitespace, doc comments, etc.
    Other(Other)
}
impl TokenType { | |
/// For use with `rustc`, converts a piece of text content and the location of the start of | |
/// that content into a `TokenType`. Delegates to the `from_content` methods on the wrapped | |
/// types. | |
fn from_content(content: &str, location: usize) -> TokenType { | |
match BlockComment::from_content(content, location) { | |
Some(bc) => return TokenType::BlockComment(bc), | |
None => () | |
} | |
match LineComment::from_content(content, location) { | |
Some(lc) => return TokenType::LineComment(lc), | |
None => () | |
} | |
TokenType::Other(Other::from_content(content, location)) | |
} | |
/// For use with `rust_analyzer`, converts a syntax node into a `TokenType`. Delegates to the | |
/// `from_node` methods on the wrapped types. | |
fn from_node(node: SyntaxNodeRef) -> TokenType { | |
match BlockComment::from_node(node) { | |
Some(bc) => return TokenType::BlockComment(bc), | |
None => () | |
} | |
match LineComment::from_node(node) { | |
Some(lc) => return TokenType::LineComment(lc), | |
None => () | |
} | |
TokenType::Other(Other::from_node(node)) | |
} | |
} | |
/// Probably no longer needed, could be useful. Basically all the things we're interested in know
/// how to print themselves as source code.
trait TextToken {
    /// Render this token exactly as it would appear in the original source,
    /// including any comment markers that were stripped during parsing.
    fn as_source(&self) -> String;
}
/// Represents a developer block comment. The start offset is stored so we don't lose ordering
/// information if we want to print the source file back out. Note that the content is the content
/// of the comment, not including the comment markers (/* */).
#[derive(Debug)]
struct BlockComment {
    /// Byte offset of the `/*` marker within the source text.
    start: usize,
    /// Text between the `/*` and `*/` markers.
    content: String
}
impl BlockComment { | |
/// For use with `rustc`, converts a piece of text content and the location of the start of | |
/// the content to a `BlockComment`, if possible. | |
fn from_content(content: &str, location: usize) -> Option<BlockComment> { | |
let block_comment = Regex::new(r"^/\*(?P<content>.*)\*/$").unwrap(); | |
match block_comment.captures(content) { | |
Some(c) => match c.get(1) { | |
Some(m) => Some(BlockComment{ start: location, content: m.as_str().to_string() }), | |
None => None | |
} | |
None => None | |
} | |
} | |
/// For use with `rust_analyzer`, converts a syntax node into a `BlockComment` if possible. | |
fn from_node(node: SyntaxNodeRef) -> Option<BlockComment> { | |
match node.kind() { | |
SyntaxKind::COMMENT => BlockComment::from_content( | |
&node.text().to_string(), node.range().start().to_usize()), | |
_ => None | |
} | |
} | |
} | |
impl TextToken for BlockComment { | |
/// Print the block comment out as it would appear in the source file. | |
fn as_source(&self) -> String { | |
format!("/*{}*/", self.content) | |
} | |
} | |
/// Represents a developer line comment. The start offset is stored so we don't lose ordering
/// information if we want to print the source file back out. Note that the content is the content
/// of the comment, not including the comment marker (//).
#[derive(Debug)]
struct LineComment {
    /// Byte offset of the `//` marker within the source text.
    start: usize,
    /// Text following the `//` marker, up to the end of the line.
    content: String
}
impl LineComment { | |
/// For use with `rustc`, converts a piece of text content and the location of the start of | |
/// the content to a `LineComment`, if possible. | |
fn from_content(content: &str, location: usize) -> Option<LineComment> { | |
let line_comment = Regex::new(r"^//([^/].*)$").unwrap(); | |
match line_comment.captures(content) { | |
Some(c) => match c.get(1) { | |
Some(m) => Some(LineComment{ start: location, content: m.as_str().to_string() }), | |
None => None | |
} | |
None => None | |
} | |
} | |
/// For use with `rust_analyzer`, converts a syntax node into a `LineComment` if possible. | |
fn from_node(node: SyntaxNodeRef) -> Option<LineComment> { | |
match node.kind() { | |
SyntaxKind::COMMENT => LineComment::from_content( | |
&node.text().to_string(), node.range().start().to_usize()), | |
_ => None | |
} | |
} | |
} | |
impl TextToken for LineComment { | |
/// Print the line comment out as it would appear in the source file. | |
fn as_source(&self) -> String { | |
format!("//{}", self.content) | |
} | |
} | |
/// For anything which is not a developer block comment or a developer line comment - we don't care
/// about the details because the goal here is only to spellcheck those. We store the full text for
/// that part of the source.
#[derive(Debug)]
struct Other {
    /// Byte offset of this token within the source text.
    start: usize,
    /// The full, unmodified text of the token.
    content: String
}
impl Other { | |
/// For use with `rustc`, converts a piece of text content and the location of the start of | |
/// the content to an `Other`. Never fails, so try after `BlockComment`/`LineComment`. | |
fn from_content(content: &str, location: usize) -> Other { | |
Other{ start: location, content: content.to_string() } | |
} | |
/// For use with `rust_analyzer`, converts a syntax node to an `Other`. | |
/// Never fails, so try after `BlockComment`/`LineComment`. | |
fn from_node(node: SyntaxNodeRef) -> Other { | |
Other{ start: node.range().start().to_usize(), content: node.text().to_string()} | |
} | |
} | |
/// For stuff we aren't spell-checking here, we print the content straight out and don't have to | |
/// add any other content. | |
impl TextToken for Other { | |
fn as_source(&self) -> String { | |
format!("{}", self.content) | |
} | |
} | |
/// Intermediate step when using `rustc`, that associates the location information to the text.
/// Probably not all fields in this struct are required.
/// NOTE(review): `kind` and `end` are never read by the code visible in this file;
/// confirm they are needed before removing.
struct TokenWithLocation {
    /// The lexer's classification of this token.
    kind: TokenKind,
    /// Byte offset of the token's first character in the source.
    start: usize,
    /// Byte offset one past the token's last character in the source.
    end: usize,
    /// The exact source text covered by the token.
    content: String
}
fn main() { | |
println!("\nRUSTC\n"); | |
let rustc = parse_with_rustc(RUST_SOURCE); | |
print_tokens(&rustc); | |
println!("\nRUST_ANALYZER\n"); | |
let rust_analyzer = parse_with_rust_analyzer(RUST_SOURCE); | |
print_tokens(&rust_analyzer); | |
} | |
/// Prints a series of `TokenTypes` out, one per line, for debug. | |
fn print_tokens(tokens: &Vec<TokenType>) -> () { | |
for t in tokens { | |
println!("{:?}", t); | |
} | |
} | |
/// Parse the provided source into `TokenTypes` using `rustc`. | |
fn parse_with_rustc(source: &str) -> Vec<TokenType> { | |
let tokenized = tokenize_with_location(source); | |
tokenized.into_iter() | |
.map(|t| TokenType::from_content(&t.content, t.start)) | |
.collect() | |
} | |
/// Tokenize the provided source using `rustc` and extract the content/location for each token | |
fn tokenize_with_location(source: &str) -> Vec<TokenWithLocation> { | |
let tokenized = tokenize(source); | |
let mut location = 0; | |
let mut tokens = vec!(); | |
for token in tokenized { | |
tokens.push(TokenWithLocation{ | |
kind: token.kind, | |
start: location, | |
end: location + token.len, | |
content: source[location..location + token.len].to_string() | |
}); | |
location += token.len; | |
} | |
tokens | |
} | |
/// Parse the given source with `rust_analyzer` and convert into `TokenType`s | |
fn parse_with_rust_analyzer(source: &str) -> Vec<TokenType> { | |
let parsed = File::parse(source); | |
parsed.ast() | |
.syntax() | |
.descendants() | |
.filter(|node| node.is_leaf() ) | |
.map(|node| TokenType::from_node(node)) | |
.collect() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment