-
-
Save lethain/34187be3090a12b74f4bdaba8f4fd796 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Prompt: | |
| Write a python3 command line script named "links.py" that takes these parameters: | |
| 1. `files_glob` - a glob like "dir1/dir2/*.md" for files to be included in analysis | |
| 2. `--grouped` which changes output format to be in "grouped" format rather than "standard" format | |
| When run, this script should: | |
| 1. Create a list of files matching `files_glob`. Assume that each of these files is in markdown format | |
| 2. For each file in that list, it should extract all the Markdown links, including the full sentence the link is included in. Each link should be represented as a dictionary with four keys: `url` is the URL from the link, `text` is the text for the link, "file" is the file the link was included in, and `surrounding` is the full sentence that includes the link | |
| 3. If running the script in "standard" format, then each of these links should be printed to standard out | |
| 4. If in grouped mode, build a dictionary of urls to links that use that url. For example, if the URL "/test/" is included in four links, then the dictionary should look like {"/test/": [{"text": "a"}, {"text": "b"}, {"text": "c"}, {"text": "d"}]}. Then print each of those groups to standard out, ordered by most frequently used links first | |
| """ | |
| #!/usr/bin/env python3 | |
| import argparse | |
| import glob | |
| import re | |
| from collections import defaultdict | |
# Regex patterns for markdown links and sentences.
#
# link_pattern captures two groups from an inline link ``[text](url)``:
# group 1 is the link text, group 2 is the URL.
link_pattern = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')

# sentence_pattern matches a run of text that contains at least one inline
# link and ends with '.', '!' or '?'.  It has no capturing groups, so
# ``findall`` returns the whole matched sentence.
# NOTE: the match stops at the first '.', '!' or '?' after the link, even when
# that character sits inside a later link's URL, and text with no closing
# punctuation is not matched at all.
sentence_pattern = re.compile(r'[^.!?]*\[[^\]]+\]\([^\)]+\)[^.!?]*[.!?]')
def parse_args(argv=None):
    """Parse the command line for the link extractor.

    Args:
        argv: Optional list of argument strings.  When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is what the
            script entry point relies on.

    Returns:
        An ``argparse.Namespace`` with ``files_glob`` (str) and ``grouped``
        (bool, ``False`` unless ``--grouped`` is given).
    """
    parser = argparse.ArgumentParser(description='Extract markdown links.')
    parser.add_argument('files_glob', help='Glob pattern to match markdown files.')
    parser.add_argument('--grouped', action='store_true', help='Group output by URL.')
    return parser.parse_args(argv)
# Inline markdown link: group 1 is the link text, group 2 is the URL.
_MD_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')

# Places where one sentence stops and the next begins:
#   term - closing punctuation followed by whitespace or end of text (the
#          punctuation belongs to the sentence it closes);
#   gap  - a blank line, or a newline directly before a list item / heading
#          marker (the break itself belongs to neither sentence).
_SENTENCE_BREAK_RE = re.compile(
    r'(?P<term>[.!?]+(?=\s|\Z))'
    r'|(?P<gap>\n\s*\n|\n(?=[ \t]*(?:[-*+]|#{1,6})[ \t]))'
)


def _sentence_spans(masked):
    """Return ordered ``(start, end)`` offsets of the sentences in *masked*."""
    spans = []
    start = 0
    for brk in _SENTENCE_BREAK_RE.finditer(masked):
        # Closing punctuation stays with its sentence; a gap does not.
        end = brk.end() if brk.lastgroup == 'term' else brk.start()
        spans.append((start, end))
        start = brk.end()
    spans.append((start, len(masked)))
    return spans


def extract_links(file_path):
    """Extract every inline markdown link from a file.

    Links are found first and the enclosing sentence is located afterwards,
    so punctuation inside a URL (``http://example.com``) cannot cut a
    sentence short, and links in text without closing punctuation (list
    items, headings, the last line of a file) are still reported.

    Args:
        file_path: Path of a UTF-8 encoded markdown file.

    Returns:
        A list of dicts in document order, one per link, with the keys
        ``url``, ``text``, ``file`` (the given *file_path*) and
        ``surrounding`` (the stripped sentence containing the link).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Blank out each link with same-length filler so offsets are preserved
    # and punctuation inside link text or URLs is never seen as a break.
    masked = _MD_LINK_RE.sub(lambda m: 'x' * len(m.group(0)), content)
    spans = _sentence_spans(masked)

    links = []
    index = 0
    for match in _MD_LINK_RE.finditer(content):
        # Links and spans are both in document order, so one forward walk
        # finds the span that contains each link.
        while spans[index][1] < match.end():
            index += 1
        start, end = spans[index]
        link_text, url = match.groups()
        links.append({
            'url': url,
            'text': link_text,
            'file': file_path,
            'surrounding': content[start:end].strip(),
        })
    return links
def standard_output(all_links):
    """Print every link dictionary on its own line ("standard" format).

    Args:
        all_links: Iterable of link dicts; each one is printed with its
            default ``dict`` representation.
    """
    for link in all_links:
        print(link)
def grouped_output(all_links):
    """Print links grouped by URL, most frequently used URL first.

    Each group is a header line with the URL and its occurrence count,
    one line per link giving its text and file, then a blank line.

    Args:
        all_links: Iterable of link dicts with at least the keys ``url``,
            ``text`` and ``file``.
    """
    grouped = defaultdict(list)
    for link in all_links:
        grouped[link['url']].append(link)

    # Sort by frequency; the sort is stable, so URLs with the same count
    # keep the order in which they were first seen.
    sorted_grouped = sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True)

    for url, links in sorted_grouped:
        print(f'URL: {url} ({len(links)} occurrences)')
        for link in links:
            print(f" - text: {link['text']} (file: {link['file']})")
        print()
def main():
    """Run the script: collect links from every matching file and print them.

    Reads the glob and the ``--grouped`` flag from the command line,
    extracts the links of each matching file, then prints them in grouped
    or standard format.
    """
    import os  # local import: only needed to skip directories below

    args = parse_args()

    # glob returns paths in arbitrary order, so sort for reproducible output.
    # recursive=True lets "**" match nested directories.
    # Directories that happen to match the pattern are skipped because they
    # cannot be opened as files.
    file_paths = sorted(
        path
        for path in glob.glob(args.files_glob, recursive=True)
        if os.path.isfile(path)
    )

    all_links = []
    for file_path in file_paths:
        all_links.extend(extract_links(file_path))

    if args.grouped:
        grouped_output(all_links)
    else:
        standard_output(all_links)
# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment