-
-
Save lethain/20ae58ce576670f245920a4ab1993056 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import argparse | |
| import glob | |
| import re | |
| from collections import defaultdict | |
# Regex patterns for markdown links and sentences.
#
# link_pattern matches one inline markdown link of the form "[text](url)".
# Group 1 is the link text (one or more characters other than "]") and
# group 2 is the URL (one or more characters other than ")").
link_pattern = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')
# sentence_pattern matches a "sentence" that contains a link: a run of
# characters other than newline, ".", "!" or "?", then a link, then any
# characters other than ".", "!" or "?" (newlines allowed here), ending at
# the first ".", "!" or "?" that follows.  It has no capture groups, so
# findall() returns the whole matched text.
# NOTE(review): the trailing part stops at the first terminator after the
# link, even if that character lies inside a later link's URL.
sentence_pattern = re.compile(r'[^\n.!?]*\[[^\]]+\]\([^\)]+\)[^.!?]*[.!?]')
def parse_args():
    """Parse the command line and return the resulting namespace.

    Returns:
        An ``argparse.Namespace`` with ``files_glob`` (the positional glob
        pattern) and ``grouped`` (``True`` when ``--grouped`` was given).
    """
    cli = argparse.ArgumentParser(description='Extract markdown links.')
    cli.add_argument(
        'files_glob',
        help='Glob pattern to match markdown files.',
    )
    cli.add_argument(
        '--grouped',
        action='store_true',
        help='Group output by URL.',
    )
    namespace = cli.parse_args()
    return namespace
def extract_links(file_path):
    """Collect every markdown link in a file together with its sentence.

    The file is read as UTF-8 and scanned with the module-level
    ``link_pattern``.  For each link the surrounding sentence is the text
    from just after the previous boundary (newline, ".", "!" or "?") up to
    and including the next ".", "!" or "?".  Punctuation inside link markup
    (for example the dots in "example.com" or the "?" of a query string) is
    ignored when looking for boundaries, so several links in one sentence
    are all reported.  If no terminator follows a link, the end of the file
    is treated as the end of its sentence, so the link is still reported.

    Args:
        file_path: Path of the markdown file to read.

    Returns:
        A list of dicts in document order, each with the keys ``url``,
        ``text``, ``file`` and ``surrounding`` (the sentence with every run
        of whitespace collapsed to a single space).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    matches = list(link_pattern.finditer(content))
    if not matches:
        return []

    # Build a same-length copy of the text in which every link is blanked
    # out, so boundary searches cannot land inside link text or a URL.
    pieces = []
    cursor = 0
    for match in matches:
        pieces.append(content[cursor:match.start()])
        pieces.append('_' * (match.end() - match.start()))
        cursor = match.end()
    pieces.append(content[cursor:])
    masked = ''.join(pieces)

    links = []
    sentence_end = 0  # End offset of the previously emitted sentence.
    index = 0
    while index < len(matches):
        first = matches[index]

        # The sentence starts after the last boundary before the link, but
        # never inside the previous sentence (rfind gives -1 -> offset 0).
        last_boundary = max(
            masked.rfind(char, sentence_end, first.start())
            for char in '\n.!?'
        )
        start = max(sentence_end, last_boundary + 1)

        # The sentence ends just after the first terminator following the
        # link; end of text acts as an implicit terminator.
        hits = [
            pos
            for pos in (masked.find(char, first.end()) for char in '.!?')
            if pos != -1
        ]
        end = min(hits) + 1 if hits else len(content)

        # str.split() with no argument splits on any whitespace run, which
        # also turns newlines and tabs into single spaces and trims ends.
        surrounding = ' '.join(content[start:end].split())

        # Every link that begins before the sentence end belongs to it.
        while index < len(matches) and matches[index].start() < end:
            link_text, url = matches[index].groups()
            links.append({
                'url': url,
                'text': link_text,
                'file': file_path,
                'surrounding': surrounding,
            })
            index += 1
        sentence_end = end
    return links
def standard_output(all_links):
    """Print the links file by file, in first-seen file order.

    Each file gets a ``file <n>: <name>`` header followed by one numbered
    entry per link showing its URL, its stripped text and its surrounding
    sentence, with a blank line after every entry.

    Args:
        all_links: Iterable of link dicts with the keys ``url``, ``text``,
            ``file`` and ``surrounding``.
    """
    per_file = {}
    for entry in all_links:
        per_file.setdefault(entry['file'], []).append(entry)

    for file_index, filename in enumerate(per_file):
        print(f'file {file_index}: {filename}')
        position = 0
        for entry in per_file[filename]:
            print(f" {position:04} url: {entry['url']}")
            print(f" text: {entry['text'].strip()}")
            # The extra newline leaves a blank line between entries.
            print(f" surr: {entry['surrounding']}", end='\n\n')
            position += 1
def grouped_output(all_links):
    """Print the links grouped by URL, most frequent URL first.

    URLs with the same number of occurrences keep their first-seen order.
    Each group shows the URL with its occurrence count, one line per
    occurrence (link text and source file), and then a blank line.

    Args:
        all_links: Iterable of link dicts with the keys ``url``, ``text``
            and ``file``.
    """
    by_url = defaultdict(list)
    for entry in all_links:
        by_url[entry['url']].append(entry)

    # Sort by frequency
    ranked = list(by_url.items())
    ranked.sort(key=lambda pair: len(pair[1]), reverse=True)

    for url, occurrences in ranked:
        count = len(occurrences)
        print(f'URL: {url} ({count} occurrences)')
        for entry in occurrences:
            print(f" - text: {entry['text']} (file: {entry['file']})")
        print()
def main():
    """Run the command-line tool.

    Expands the glob pattern from the command line, extracts the links from
    every matching file, and prints them either grouped by URL (when
    ``--grouped`` was given) or file by file.
    """
    args = parse_args()
    # glob.glob() returns paths in arbitrary order; sorting keeps the output
    # (including the "file <n>" numbering) reproducible between runs.
    # recursive=True lets a "**" path component descend into subdirectories.
    file_paths = sorted(glob.glob(args.files_glob, recursive=True))

    all_links = []
    for file_path in file_paths:
        all_links.extend(extract_links(file_path))

    if args.grouped:
        grouped_output(all_links)
    else:
        standard_output(all_links)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment