Skip to content

Instantly share code, notes, and snippets.

@quark-zju
Created February 26, 2020 23:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save quark-zju/600f9f869f83384b87c66a4408c78bef to your computer and use it in GitHub Desktop.
Save quark-zju/600f9f869f83384b87c66a4408c78bef to your computer and use it in GitHub Desktop.
[package]
name = "pathmatcher"
version = "0.1.0"
edition = "2018"
[lib]
path = "lib.rs"
[dependencies]
bitflags = "1.0"
# good: 0.4.2
globset = "= 0.4.2"
# bad: 0.4.3
# globset = "= 0.4.3"
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This software may be used and distributed according to the terms of the
* GNU General Public License version 2.
*/
//! Tree-aware pattern matcher
//!
//! [TreeMatcher] is the main structure.
use bitflags::bitflags;
use globset::{Glob, GlobBuilder, GlobSet, GlobSetBuilder};
use std::path::Path;
bitflags! {
struct RuleFlags: u8 {
// A negative rule.
const NEGATIVE = 1;
// Auto-generated rule because the user specified a subpath.
const PARENT = 2;
// Mark a rule as "recursive" (ex. ending with "/**").
const RECURSIVE = 4;
}
}
/// Pattern matcher constructed by an ordered list of positive and negative
/// glob patterns. Negative patterns are prefixed with `!`.
///
/// The syntax is quite similar to gitignore, with some difference to avoid
/// inefficient uses. See [`TreeMatcher::from_rules`] for details about the
/// patterns.
///
/// The [TreeMatcher::match_recursive] method can be used to rule out
/// unnecessary directory visit early.
#[derive(Clone, Debug)]
pub struct TreeMatcher {
// The [GlobSet] takes care of many algorithm stuff. It can match a path
// against multiple patterns and return the pattern indexes.
glob_set: GlobSet,
// Flags (ex. negative rule or is it a parent directory) for additional
// information matching the pattern indexes.
rule_flags: Vec<RuleFlags>,
}
impl TreeMatcher {
/// Create [TreeMatcher] using an ordered list of patterns.
///
/// The patterns are glob patterns supported by the `globset` crate.
/// Like gitignore, negative patterns are supported. They are prefixed
/// with `!`. Special characters can be escaped by prefixing `\\`.
///
/// Patterns are ordered. A later pattern always overrides an earlier
/// pattern. Invalid patterns are ignored.
///
/// Unlike gitignore, all patterns are treated as using absolute paths.
/// That is, `*.c` is treated the same as `/*.c` and does not match `a/b.c`.
/// Similarly, `!*.c` will be treated as `!/*.c`, in gitignore's sense.
/// Use `**/*.c` to match files recursively. Note the `**` in the middle
/// of a pattern effectively disable fast paths provided by `match_recursive`.
///
/// Patterns do not match recursively.
///
/// For example, both `/a/b` and `/a*/b*` do NOT match `/a/b/c/d`. Append
/// `/**` to make rules recursive. The matcher works best if all rules end
/// with `**`.
pub fn from_rules(
rules: impl Iterator<Item = impl AsRef<str>>,
) -> Result<Self, globset::Error> {
let mut builder = GlobSetBuilder::new();
let mut rule_flags = Vec::new();
for rule in rules {
let rule = rule.as_ref();
let (negative, rule) = if rule.starts_with("!") {
(true, &rule[1..])
} else {
(false, rule)
};
// Strip a leading "/". More friendly to gitignore users.
let rule = if rule.starts_with("/") {
&rule[1..]
} else {
rule
};
// "{", "}" do not have special meaning in gitignore, while
// globset treats them differently.
//
// For now, workaround it by escaping. In the future, this can
// possibly be done by tweaking a GlobBuilder option in
// build_globs().
//
// See https://github.com/BurntSushi/ripgrep/issues/1183.
let rule = escape_curly_brackets(&rule);
// Add flags to the rule_id
let mut flag = if negative {
RuleFlags::NEGATIVE
} else {
RuleFlags::empty()
};
// Insert "parent" rules so match_recursive won't return "None"
// incorrectly.
let mut sep_index = 0;
let rule_bytes = rule.as_ref();
while let Some(index) = next_path_separator(rule_bytes, sep_index) {
if index > 0 && index < rule_bytes.len() - 1 {
let parent_rule = &rule[..index];
for glob in build_globs(parent_rule)? {
builder.add(glob);
rule_flags.push(flag | RuleFlags::PARENT);
}
}
sep_index = index + 1;
}
// Mark the rule as recursive so fast paths (i.e. claim everything
// matches or nothing matches) can be used.
if rule.ends_with("/**") || rule.ends_with("**/*") || rule == "**" {
flag |= RuleFlags::RECURSIVE;
}
// Insert the rule.
// NOTE: This crate depends on the fact that "a/**" matches "a", although
// the documentation of globset might say otherwise.
for glob in build_globs(&rule)? {
builder.add(glob);
rule_flags.push(flag);
}
}
let glob_set = builder.build()?;
let matcher = Self {
glob_set,
rule_flags,
};
Ok(matcher)
}
/// Create [TreeMatcher] that matches nothing.
pub fn never() -> Self {
let rules: [&str; 0] = [];
TreeMatcher::from_rules(rules.iter()).unwrap()
}
/// Create [TreeMatcher] that matches everything.
pub fn always() -> Self {
let rules: [&str; 1] = ["**"];
TreeMatcher::from_rules(rules.iter()).unwrap()
}
/// Return `Some(bool)` if for all path inside the given `dir`,
/// `matches(path)` will return `bool`.
///
/// Return `None` if there is no fast path.
///
/// `/` should be used as the path separator, regardless of system.
pub fn match_recursive(&self, dir: impl AsRef<Path>) -> Option<bool> {
let dir = dir.as_ref();
// A subpath may match - cannot return Some(false)
let mut subpath_may_match = false;
// A subpath may mismatch - cannot return Some(true)
let mut subpath_may_mismatch = false;
for id in self.glob_set.matches(dir).into_iter().rev() {
let flag = self.rule_flags[id];
if flag.contains(RuleFlags::PARENT) {
// An auto-generated parent rule matches.
if flag.contains(RuleFlags::NEGATIVE) {
subpath_may_mismatch = true;
} else {
subpath_may_match = true;
}
} else {
// If it is not RECURSIVE, then fast paths (i.e. claim everything
// matches, or nothing matches) cannot be used.
if !flag.contains(RuleFlags::RECURSIVE) {
subpath_may_match = true;
subpath_may_mismatch = true;
}
// A non-parent rule matches.
if flag.contains(RuleFlags::NEGATIVE) {
if subpath_may_match {
return None;
} else {
return Some(false);
}
} else {
if subpath_may_mismatch {
return None;
} else {
return Some(true);
}
}
}
}
if subpath_may_match {
None
} else if !self.rule_flags.is_empty() && dir.to_str() == Some("") {
// Special case: empty dir
None
} else {
Some(false)
}
}
/// Return if `path` matches with the matcher.
///
/// `/` should be used as the path separator, regardless of system.
pub fn matches(&self, path: impl AsRef<Path>) -> bool {
for id in self.glob_set.matches(path).into_iter().rev() {
let flag = self.rule_flags[id];
if flag.contains(RuleFlags::PARENT) {
// For full path matches, parent rules do not count.
continue;
} else if flag.contains(RuleFlags::NEGATIVE) {
// Not matched.
return false;
} else {
// Matched.
return true;
}
}
// No rule matches
false
}
}
fn build_globs(pat: &str) -> Result<Vec<Glob>, globset::Error> {
// Fast path (maybe).
if pat.ends_with("/**") {
// Rewrite "foo/**" (literal_separator=true) to "foo" (literal_separator=false) and "foo/*"
// (literal_separator=false) so MatchStrategy::Literal and MatchStrategy::Prefix (instead
// of MatchStrategy::Regex) might be used. See globset::glob::Glob::{literal,prefix}.
let prefix = &pat[..pat.len() - 3];
if !prefix.contains("?") && !prefix.contains("*") {
let rules = [prefix, &pat[..pat.len() - 1]];
return rules
.iter()
.map(|r| GlobBuilder::new(r).backslash_escape(true).build())
.collect();
}
}
// General path.
let glob = GlobBuilder::new(pat)
.literal_separator(true) // `*` or `?` should not match `/`
.backslash_escape(true)
.build()?;
Ok(vec![glob])
}
/// Find the next path separator in a pattern. Respect escaping rules.
/// Return the index (>= `start`), or None if there are no remaining path separator.
fn next_path_separator(pat: &[u8], start: usize) -> Option<usize> {
let mut in_box_brackets = false;
let mut escaped = false;
for (i, ch) in pat.iter().skip(start).enumerate() {
if escaped {
match ch {
_ => escaped = false,
}
} else if in_box_brackets {
match ch {
b']' => in_box_brackets = false,
_ => (),
}
} else {
match ch {
b'\\' => escaped = true,
b'[' => in_box_brackets = true,
b'/' => return Some(i + start),
_ => (),
}
}
}
None
}
/// Escape `{` and `}` so they no longer have special meanings to `globset`.
fn escape_curly_brackets(pat: &str) -> String {
if pat.contains('{') || pat.contains('}') {
let mut result = String::with_capacity(pat.len() * 2);
for ch in pat.chars() {
match ch {
'{' => result.push_str("\\{"),
'}' => result.push_str("\\}"),
ch => result.push(ch),
}
}
result
} else {
// No escaping is needed
pat.to_string()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_never_matcher() {
let m = TreeMatcher::never();
assert_eq!(m.match_recursive(""), Some(false));
assert_eq!(m.match_recursive("a"), Some(false));
assert_eq!(m.match_recursive("a/b"), Some(false));
assert_eq!(m.matches(""), false);
assert_eq!(m.matches("a/b"), false);
}
#[test]
fn test_always_matcher() {
let m = TreeMatcher::always();
assert_eq!(m.match_recursive(""), Some(true));
assert_eq!(m.match_recursive("a"), Some(true));
assert_eq!(m.match_recursive("a/b"), Some(true));
assert_eq!(m.matches(""), true);
assert_eq!(m.matches("a/b"), true);
}
#[test]
fn test_literal_paths() {
let m = TreeMatcher::from_rules(["/a/**", "b/c/d/**", "\\e/\\f/**"].iter()).unwrap();
assert_eq!(m.match_recursive(""), None);
assert_eq!(m.match_recursive("a"), Some(true));
assert_eq!(m.match_recursive("a/b"), Some(true));
assert_eq!(m.match_recursive("b"), None);
assert_eq!(m.match_recursive("b/c"), None);
assert_eq!(m.match_recursive("b/c/d"), Some(true));
assert_eq!(m.match_recursive("b/c/d/e"), Some(true));
assert_eq!(m.match_recursive("e/f/g"), Some(true));
assert_eq!(m.match_recursive("c"), Some(false));
assert_eq!(m.match_recursive("c/a"), Some(false));
assert_eq!(m.matches(""), false);
assert_eq!(m.matches("a/b"), true);
assert_eq!(m.matches("b/x"), false);
assert_eq!(m.matches("b/c/d/e"), true);
assert_eq!(m.matches("e"), false);
assert_eq!(m.matches("e/f1"), false);
assert_eq!(m.matches("e/f/g"), true);
}
#[test]
fn test_simple_glob() {
let m = TreeMatcher::from_rules(["a/*[cd][ef]/**"].iter()).unwrap();
assert_eq!(m.match_recursive("a"), None);
assert_eq!(m.match_recursive("b"), Some(false));
assert_eq!(m.match_recursive("a/x"), Some(false));
assert_eq!(m.match_recursive("a/xde"), Some(true));
assert_eq!(m.match_recursive("a/xde/x"), Some(true));
assert_eq!(m.matches("a/12df"), true);
assert_eq!(m.matches("a/12df/12df"), true);
}
#[test]
fn test_complex_glob() {
let m = TreeMatcher::from_rules(["a/v/**/*.c/**", "a/**/w/*.c/**"].iter()).unwrap();
assert_eq!(m.match_recursive("a"), None);
assert_eq!(m.match_recursive("b"), Some(false));
assert_eq!(m.match_recursive("a/v/.c"), Some(true));
assert_eq!(m.match_recursive("a/v/.c/z"), Some(true));
assert_eq!(m.match_recursive("a/z"), None);
assert_eq!(m.matches("v/.c"), false);
assert_eq!(m.matches("a/v/.c"), true);
assert_eq!(m.matches("a/w/.c"), true);
assert_eq!(m.matches("a/v/c/v/c/v/c/v/c/v.c"), true);
assert_eq!(m.matches("a/c/c/c/c/w/w.c"), true);
assert_eq!(m.matches("a/w/v/w.c"), false);
// "{" has no special meaning
let m = TreeMatcher::from_rules(["a/{b,c/d}/**"].iter()).unwrap();
assert_eq!(m.match_recursive("a/{b,c/d}"), Some(true));
assert_eq!(m.match_recursive("a/{b,c"), None);
assert_eq!(m.match_recursive("a/{b,d"), Some(false));
}
#[test]
fn test_mixed_literal_and_simple_glob() {
let m = TreeMatcher::from_rules(["b/c/d/**", "b/*c/**", "b/1c/**"].iter()).unwrap();
assert_eq!(m.match_recursive(""), None);
assert_eq!(m.match_recursive("b/c/d/e"), Some(true));
assert_eq!(m.match_recursive("b/1c"), Some(true));
assert_eq!(m.match_recursive("b/xc/yc"), Some(true));
assert_eq!(m.match_recursive("b/xc"), Some(true));
assert_eq!(m.match_recursive("b/d"), Some(false));
assert_eq!(m.matches("b/c/d/e/f"), true);
assert_eq!(m.matches("b/fc"), true);
assert_eq!(m.matches("b/ce"), false);
assert_eq!(m.matches("b/c/e"), true);
}
#[test]
fn test_mixed_literal_and_complex_glob() {
let m = TreeMatcher::from_rules(["b/c/d/**", "b/**/c/**"].iter()).unwrap();
assert_eq!(m.match_recursive("b/c/d/e"), Some(true));
assert_eq!(m.match_recursive("b/d"), None);
assert_eq!(m.match_recursive("b/c"), Some(true));
assert_eq!(m.match_recursive("b/x/c/y"), Some(true));
assert_eq!(m.matches("b/c/d/e/f"), true);
assert_eq!(m.matches("b/c/d"), true);
assert_eq!(m.matches("b/c"), true);
assert_eq!(m.matches("b"), false);
assert_eq!(m.matches("b/x/y/c/x/y"), true);
}
#[test]
fn test_empty_negative() {
let m = TreeMatcher::from_rules(["!a/**"].iter()).unwrap();
assert_eq!(m.match_recursive(""), None); // better answer is Some(false)
assert_eq!(m.match_recursive("a"), Some(false));
assert_eq!(m.match_recursive("a/b"), Some(false));
assert_eq!(m.matches(""), false);
assert_eq!(m.matches("a/b"), false);
}
#[test]
fn test_literal_negative() {
let m = TreeMatcher::from_rules(["a/**", "!a/b/**", "a/b/c/**"].iter()).unwrap();
assert_eq!(m.match_recursive("a"), None);
assert_eq!(m.match_recursive("a/c"), Some(true));
assert_eq!(m.match_recursive("a/b"), None);
assert_eq!(m.match_recursive("a/b/d"), Some(false));
assert_eq!(m.match_recursive("a/b/c"), Some(true));
assert_eq!(m.matches("a"), true);
assert_eq!(m.matches("a/b"), false);
assert_eq!(m.matches("a/b/c/d"), true);
assert_eq!(m.matches("a/b/d"), false);
assert_eq!(m.matches("a/c"), true);
assert_eq!(m.matches("z"), false);
}
#[test]
fn test_negative_override() {
let m = TreeMatcher::from_rules(["a/**", "!a/**", "!b/**", "b/**"].iter()).unwrap();
assert_eq!(m.match_recursive("a/b"), Some(false));
assert_eq!(m.match_recursive("b/c"), Some(true));
assert_eq!(m.matches("a"), false);
assert_eq!(m.matches("b"), true);
}
#[test]
fn test_mixed_negative_literal_simple_glob() {
let m =
TreeMatcher::from_rules(["a*/**", "!a1/**", "a1/a/**", "!a1/a*c/**"].iter()).unwrap();
assert_eq!(m.match_recursive("b"), Some(false));
assert_eq!(m.match_recursive("a1/a"), Some(true));
assert_eq!(m.matches("a"), true);
assert_eq!(m.matches("a1"), false);
assert_eq!(m.matches("a1/a"), true);
assert_eq!(m.matches("a1/b"), false);
assert_eq!(m.matches("a1/a1c"), false);
assert_eq!(m.matches("a2"), true);
assert_eq!(m.matches("b"), false);
}
#[test]
fn test_fast_paths() {
// Some interesting fast paths
let m = TreeMatcher::from_rules(["a/**/b/**"].iter()).unwrap();
assert_eq!(m.match_recursive("a/b"), Some(true));
assert_eq!(m.match_recursive("a/1/2/3/b"), Some(true));
let m = TreeMatcher::from_rules(["a/**/b/**", "!a/**/b/*/**"].iter()).unwrap();
assert_eq!(m.match_recursive("a/b/1"), Some(false));
assert_eq!(m.match_recursive("a/1/2/3/b/2"), Some(false));
}
#[test]
fn test_non_recursive_patterns() {
let m = TreeMatcher::from_rules(["a/*"].iter()).unwrap();
assert!(m.matches("a/a"));
assert!(!m.matches("b/a"));
assert!(!m.matches("a"));
assert_eq!(m.match_recursive("a"), None);
assert_eq!(m.match_recursive("a/b"), None);
assert_eq!(m.match_recursive("a/b/c"), Some(false));
assert_eq!(m.match_recursive("b"), Some(false));
let m = TreeMatcher::from_rules(["*a"].iter()).unwrap();
assert!(m.matches("aa"));
assert!(!m.matches("aa/b"));
assert!(!m.matches("b"));
assert_eq!(m.match_recursive("aa"), None);
assert_eq!(m.match_recursive("a/a"), Some(false));
assert_eq!(m.match_recursive("b"), Some(false));
let m = TreeMatcher::from_rules(["b*/**/*a"].iter()).unwrap();
assert!(m.matches("b1/aa"));
assert!(!m.matches("c/aa"));
assert!(!m.matches("b1/aa/11"));
assert!(!m.matches("b/a/b"));
assert!(m.matches("b/a/b/a"));
assert_eq!(m.match_recursive("aa"), Some(false));
assert_eq!(m.match_recursive("b/a/b"), None);
assert_eq!(m.match_recursive("b/a/b/a"), None);
}
#[test]
fn test_next_path_separator() {
assert_eq!(next_path_separator(b"/a/b", 0), Some(0));
assert_eq!(next_path_separator(b"/a/b", 1), Some(2));
assert_eq!(next_path_separator(b"/a/b", 2), Some(2));
assert_eq!(next_path_separator(b"/a/b", 3), None);
assert_eq!(next_path_separator(b"[/]a\\/b", 0), None);
assert_eq!(next_path_separator(b"\\[/]a", 0), Some(2));
}
#[test]
fn test_t62872044() {
let patterns = ["aaandr/l/aaj/**", "xplat/hermes/**"];
let m1 = TreeMatcher::from_rules(patterns.iter()).unwrap();
let m2 = TreeMatcher::from_rules(patterns.iter().rev()).unwrap();
assert!(m2.matches("aaandr/l/aaj/x"));
// FIXME: This can fail with globset 0.4.3. Bisected to:
// https://github.com/BurntSushi/aho-corasick/commit/063ca0d253b1cfcef9f6b5533bee0d1fd2737c33
assert!(m1.matches("aaandr/l/aaj/x"));
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment