A Python 3 script, intended for macOS, that extracts the links from a Word document
# Requirements
# pip3 install python-docx
# pip3 install xlsxwriter
# pip3 install requests
#
# Run the script
# python3 extract-links.py

from docx import Document
import re
import xlsxwriter
import requests
# Return the HTTP response code as an integer for a given link.
# requests.head() does not follow redirects by default, so 3xx codes
# are reported as-is.
def check_link_status(link):
    try:
        response = requests.head(link, timeout=10)
        return response.status_code
    except requests.RequestException:
        # Treat unreachable links (timeouts, DNS failures, etc.) as 500
        return 500
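
# A more defensive variant (a sketch, not used by the script below): some
# servers reject HEAD requests outright (often with 403 or 405), which can
# make a working link look broken, so this version falls back to a GET.
def check_link_status_with_fallback(link):
    try:
        response = requests.head(link, timeout=10)
        if response.status_code in (403, 405):
            # stream=True avoids downloading the whole body just for the status
            response = requests.get(link, timeout=10, stream=True)
        return response.status_code
    except requests.RequestException:
        return 500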
# Return a list of links from a given Word document path. Hyperlink targets
# live in the document part's relationships; _target is a private
# python-docx attribute, but it is fine for this quick analysis.
def extract_links(doc_path):
    document = Document(doc_path)
    rels = document.part.rels
    links = []
    for rel in rels:
        if "hyperlink" in rels[rel].reltype:
            url = rels[rel]._target
            links.append(url)
    return links
# Write the links from a given list to a given text file path, one per line.
# Note: links.sort() sorts the list in place, so callers see the sorted order.
def write_links_to_file(links, file_path):
    links.sort()
    with open(file_path, 'w') as f:
        for link in links:
            f.write(link + '\n')
# Count the path segments ("subfolders") in a given URL string.
def get_subfolder_count(url):
    subfolder_pattern = r"/[^/]+"
    match = re.findall(subfolder_pattern, url)
    return len(match)
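
# For example (the host counts as one segment because of the "//" in the
# scheme): "https://www.hsph.harvard.edu/news/" yields 2 matches
# ("/www.hsph.harvard.edu" and "/news"), while
# "https://www.hsph.harvard.edu/news/press-releases/" yields 3, which is
# why main() below treats a count above 2 as a deep link.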
# Write the links from a given list to a given Excel worksheet, one per row.
def write_to_excel(worksheet, links):
    row = 0
    for link in links:
        worksheet.write(row, 0, link)
        row += 1
    return worksheet
def main():
    doc_path = 'report.docx'
    hsph_links_path = 'hsph.txt'
    hsph_top_level_path = 'hsph_top_level.txt'
    hsph_multi_level_path = 'hsph_multi_level.txt'
    other_links_path = 'others.txt'
    broken_path = 'broken.txt'
    redirecting_path = 'redirecting.txt'

    workbook = xlsxwriter.Workbook('links.xlsx')
    summary_worksheet = workbook.add_worksheet('Summary')
    raw_worksheet = workbook.add_worksheet('Raw links')
    hsph_worksheet = workbook.add_worksheet('All Unique HSPH links')
    hsph_top_worksheet = workbook.add_worksheet('Unique HSPH Top links')
    hsph_multi_worksheet = workbook.add_worksheet('Unique HSPH Deep links')
    other_worksheet = workbook.add_worksheet('External')
    redirecting_worksheet = workbook.add_worksheet('Redirections')
    broken_worksheet = workbook.add_worksheet('Broken')
    # Get all links
    links = extract_links(doc_path)
    print(f"Total links extracted: {len(links)}")
    summary_row = 0
    summary_worksheet.write(summary_row, 0, f"Total links extracted: {len(links)}")
    summary_row += 1
    write_to_excel(raw_worksheet, links)
    # Extract main website links (unique)
    hsph_links = list(set(link for link in links if 'www.hsph' in link))
    hsph_top_level_links = []
    hsph_multi_level_links = []
    for link in hsph_links:
        if get_subfolder_count(link) > 2:
            hsph_multi_level_links.append(link)
        else:
            hsph_top_level_links.append(link)
    write_links_to_file(hsph_links, hsph_links_path)
    write_to_excel(hsph_worksheet, hsph_links)
    print(f"Unique HSPH links: {len(hsph_links)}")
    summary_worksheet.write(summary_row, 0, f"Unique HSPH links: {len(hsph_links)}")
    summary_row += 1
    write_links_to_file(hsph_top_level_links, hsph_top_level_path)
    write_to_excel(hsph_top_worksheet, hsph_top_level_links)
    print(f"Unique HSPH top level links: {len(hsph_top_level_links)}")
    summary_worksheet.write(summary_row, 0, f"Unique HSPH top level links: {len(hsph_top_level_links)}")
    summary_row += 1
    write_links_to_file(hsph_multi_level_links, hsph_multi_level_path)
    write_to_excel(hsph_multi_worksheet, hsph_multi_level_links)
    print(f"Unique HSPH deep links: {len(hsph_multi_level_links)}")
    summary_worksheet.write(summary_row, 0, f"Unique HSPH deep links: {len(hsph_multi_level_links)}")
    summary_row += 1

    # Extract links pointing to other domains (unique)
    other_links = list(set(link for link in links if link not in hsph_links))
    write_links_to_file(other_links, other_links_path)
    write_to_excel(other_worksheet, other_links)
    summary_worksheet.write(summary_row, 0, f"Unique external links: {len(other_links)}")
    summary_row += 1
    print(f"Unique external links: {len(other_links)}")
    # Check every unique link once
    redirecting_links = []
    broken_links = []
    redirecting_row = 0
    broken_row = 0
    for link in set(links):
        code = check_link_status(link)
        if 300 <= code < 400:
            redirecting_links.append(link + ' code: ' + str(code))
            redirecting_worksheet.write(redirecting_row, 0, link)
            redirecting_worksheet.write(redirecting_row, 1, str(code))
            redirecting_row += 1
        elif code >= 400:
            # 4xx, plus the 500 returned for unreachable links
            broken_links.append(link + ' code: ' + str(code))
            broken_worksheet.write(broken_row, 0, link)
            broken_worksheet.write(broken_row, 1, str(code))
            broken_row += 1
    write_links_to_file(redirecting_links, redirecting_path)
    print(f"Redirecting links: {len(redirecting_links)}")
    summary_worksheet.write(summary_row, 0, f"Redirecting links: {len(redirecting_links)}")
    summary_row += 1
    write_links_to_file(broken_links, broken_path)
    print(f"Broken links: {len(broken_links)}")
    summary_worksheet.write(summary_row, 0, f"Broken links: {len(broken_links)}")
    summary_row += 1
    workbook.close()

if __name__ == "__main__":
    main()
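The document name and the site filter are hardcoded in main(). To run the script against a different report or website, edit those two values, for example (audit.docx and www.example.edu are placeholders):

doc_path = 'audit.docx'
hsph_links = list(set(link for link in links if 'www.example.edu' in link))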
This was mostly written using Claude AI; it is a "quick and dirty" document analysis script and is not meant to be production code.