# erikdstock/extractor.py

## extractor.py
# Naive script that reads a text file,
# extracts what look like APA citations from parentheses,
# splits them on `;` to separate grouped citations,
# and finally writes them to a text file.
# usage: `python extractor.py ./some-text.txt`

import sys
import re


def main():
    """Command-line entry point: extract citations from the file named in argv.

    Takes the input path from the first command-line argument and passes it
    to ``process_file``.

    Raises:
        SystemExit: with a usage message when no input path was supplied.
    """
    # Without this guard a bare `python extractor.py` dies with an opaque
    # IndexError traceback instead of telling the user what to pass.
    if len(sys.argv) < 2:
        sys.exit("usage: python extractor.py ./some-text.txt")

    print("Extracting citations...")
    filename = sys.argv[1]
    process_file(filename)


def process_file(filename, output_path='output.txt'):
    """Extract citation-like parenthesised text from a file and write it out.

    The input is read in full, newlines are replaced by spaces so that
    citations wrapped across lines are matched, and every unique citation is
    written to ``output_path`` in sorted order, one per line.

    Args:
        filename: Path of the text file to scan.
        output_path: Path of the file to write the citations to. Defaults to
            ``output.txt`` in the current working directory.
    """
    with open(filename) as file:
        text = file.read().replace('\n', ' ')

    # The input handle is released before the output is opened; nothing below
    # needs it any more.
    citations = _extract_citations(text)

    with open(output_path, 'w') as f:
        f.writelines(citation + "\n" for citation in citations)


def _extract_citations(text):
    """Return the sorted, de-duplicated citations found in ``text``."""
    # Only parenthesised groups that start with a capital letter are treated
    # as citations. A more permissive variant would drop the `[A-Z]`
    # requirement: r"\(([^)]+)\)"
    groups = re.findall(r"\(([A-Z][^)]+)\)", text)

    # Split on `;` alone and strip each piece: the whitespace after the
    # separator varies (none, one space, or several once a line break has
    # been turned into a space), and unstripped pieces would defeat the
    # de-duplication below.
    unique = {part.strip() for group in groups for part in group.split(";")}

    # A trailing or doubled `;` yields an empty piece, which is not a citation.
    unique.discard("")
    return sorted(unique)


# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()