Last active
October 18, 2022 02:31
-
-
Save erikdstock/a9ed9915a93c3f6427fdaa5a41f6821a to your computer and use it in GitHub Desktop.
Extract things that look like APA citations to a text file (one citation per line)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Naive script that reads a text file,
# extracts what look like APA citations from parentheses,
# splits each parenthesized group on `; ` to separate the individual citations,
# and finally writes the unique, sorted citations to `output.txt`.
# Usage: `python extractor.py ./some-text.txt`
import sys | |
import re | |
def main():
    """Command-line entry point.

    Takes the path of the text file to scan from the first command-line
    argument and hands it to ``process_file``.

    Raises:
        SystemExit: with a usage message (non-zero exit status) when no
            input path was supplied on the command line.
    """
    # Fail with a readable usage hint instead of a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: python extractor.py ./some-text.txt")

    print("Extracting citations...")
    process_file(sys.argv[1])
def process_file(filename, *, output_path='output.txt', encoding=None):
    """Extract citation-like strings from a text file and write them out.

    The file is read in full and newlines are replaced with spaces, so a
    parenthesized group that wraps across lines is still matched. Every
    parenthesized group whose first character is an ASCII capital letter is
    treated as a group of citations and split on ``"; "``. The resulting
    strings are de-duplicated, sorted, and written one per line.

    Args:
        filename: Path of the text file to scan.
        output_path: Path of the file to write; defaults to ``output.txt``
            in the current working directory.
        encoding: Text encoding used for both reading and writing. ``None``
            (the default) uses the platform default encoding.

    Returns:
        The sorted list of unique citation strings that was written.
    """
    with open(filename, encoding=encoding) as source:
        text = source.read().replace('\n', ' ')

    # Raw string: `\(` and `\)` are regex escapes, not Python string escapes.
    # A more permissive pattern (any parenthesized text, not just groups
    # starting with a capital letter) would be r"\(([^)]+)\)".
    parens_groups = re.findall(r"\(([A-Z][^)]+)\)", text)

    # Split each group into individual citations; the set drops duplicates.
    unique_citations = {
        citation
        for group in parens_groups
        for citation in group.split("; ")
    }
    sorted_citations = sorted(unique_citations)

    with open(output_path, 'w', encoding=encoding) as sink:
        sink.writelines(citation + "\n" for citation in sorted_citations)

    return sorted_citations
# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment