Created
January 8, 2022 11:38
-
-
Save antiproblemist/5fdb01621ad5e684f544d8b4433b7f50 to your computer and use it in GitHub Desktop.
Convert an Excel sheet of URLs into sitemap files (.xml)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted. | |
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
""" | |
import pandas as pd | |
from lxml import etree | |
from tqdm import tqdm | |
from datetime import datetime | |
from math import ceil | |
link_col = "URLs"  # Name of the column with the link
per_file_limit = 10000  # Number of URLs in each sitemap

# Prompt for the workbook path, then load the sheet named 'Sheet1' into a
# DataFrame. Only the read itself is guarded, so the "File error" message is
# reserved for problems with the workbook.
input_workbook_path = input('Enter the excel workbook path: ')
try:
    df = pd.read_excel(input_workbook_path, 'Sheet1', index_col=None, engine="openpyxl")
except Exception as e:
    print("File error")
    print(e)
    # Nothing after this point can run without the DataFrame; stop here with a
    # non-zero exit status instead of failing later with a NameError on `df`.
    raise SystemExit(1)

# Fail once, up front, if the configured column is absent, rather than
# reporting the same lookup error for every row of the sheet.
if link_col not in df.columns:
    print("File error")
    print("Column %r not found in Sheet1" % link_col)
    raise SystemExit(1)
# Split the DataFrame rows into chunks of `per_file_limit` and write each chunk
# to its own sitemap file: sitemap-1.xml, sitemap-2.xml, ...

# Every entry carries the same date, so format it once for the whole run
# instead of once per row.
lastmod_datetime = datetime.now().strftime('%Y-%m-%d')  # Or get it from your worksheet row[lastmodified_col]

file_count = int(ceil(float(len(df.index)) / float(per_file_limit)))
for file_number in range(1, file_count + 1):
    # Positional row bounds of this chunk, derived from the 1-based file number.
    count_lower_limit = (file_number - 1) * per_file_limit
    count_higher_limit = count_lower_limit + per_file_limit
    chunk = df.iloc[count_lower_limit:count_higher_limit]

    root = etree.Element('urlset', xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    for index, row in tqdm(chunk.iterrows(), total=len(chunk.index)):
        url = etree.Element("url")
        try:
            # The cell lookup and the text assignment are the only steps that
            # can fail: a missing column raises KeyError, and lxml rejects
            # non-string values (e.g. an empty cell read as NaN) or text that
            # is not valid in XML with TypeError / ValueError.
            etree.SubElement(url, "loc").text = row[link_col]
        except (KeyError, TypeError, ValueError) as e:
            # Best effort: report the bad row and leave it out of the sitemap.
            print(e)
            continue

        etree.SubElement(url, "lastmod").text = lastmod_datetime
        etree.SubElement(url, "priority").text = str(1)  # Or get it from your worksheet row[priority_col]
        etree.SubElement(url, "changefreq").text = "weekly"  # Or get it from your worksheet row[changefreq_col]
        root.append(url)

    file_name = "sitemap-%s.xml" % file_number
    # The context manager closes the file even if serialisation or the write fails.
    with open(file_name, 'wb') as sitemap_file:
        sitemap_file.write(etree.tostring(root, pretty_print=True, xml_declaration=True, encoding='UTF-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment