Skip to content

Instantly share code, notes, and snippets.

@sameerkumar18
Last active December 28, 2021 10:21
Show Gist options
  • Save sameerkumar18/6f54b2064237b1614e21965a7b236166 to your computer and use it in GitHub Desktop.
Save sameerkumar18/6f54b2064237b1614e21965a7b236166 to your computer and use it in GitHub Desktop.
Export Wix Blogs to CSV - No API needed.
# Base address of the Wix site whose blog is exported. The value below is a
# placeholder and has to be replaced with the real site address.
WIX_SITE_URL = 'https://www.YOUR WIX SITE.com'

# Collects one dict per scraped blog post; the rows are written to CSV later.
responses = list()
import xmltodict
import requests
import json
import csv
def get_blog_posts_urls():
    """Return the URLs of all blog posts listed in the site's blog sitemap.

    Downloads ``{WIX_SITE_URL}/blog-posts-sitemap.xml``, parses it with
    ``xmltodict`` and collects the ``loc`` value of every ``url`` entry.
    The resulting list is also printed.

    Returns:
        list: The post URLs in sitemap order; empty when the sitemap
        contains no ``url`` entries.

    Raises:
        requests.HTTPError: If the sitemap request returns an error status.
        KeyError: If the document has no ``urlset`` root or an entry has
            no ``loc`` child.
    """
    sitemap_url = f'{WIX_SITE_URL}/blog-posts-sitemap.xml'
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(sitemap_url, timeout=30)
    res.raise_for_status()
    raw = xmltodict.parse(res.text)

    # ``urlset`` may parse to None (no children/attributes) and ``url`` may be
    # absent when the blog has no posts.
    urlset = raw['urlset'] or {}
    entries = urlset.get('url') or []
    # xmltodict yields a single mapping (not a list) when there is exactly
    # one <url> element; normalise so the comprehension sees entries only.
    if isinstance(entries, dict):
        entries = [entries]

    blog_urls = [entry['loc'] for entry in entries]
    print(blog_urls)
    return blog_urls
# Fetch the blog post URLs once at module level; the scraping loop further
# down iterates over this list.
blog_urls = get_blog_posts_urls()
def _remove_attrs(soup):
for tag in soup.find_all(True):
attrs = dict(tag.attrs)
for attr in attrs:
if 'data-hook' not in attr and 'data-id' not in attr and 'src' not in attr and 'href' not in attr:
del tag.attrs[attr]
elif (attrs.get('type') and attrs['type'] == 'empty-line') or (attrs.get('data-hook') and 'rcv-' in attrs['data-hook']):
tag.extract()
return soup
from bs4 import BeautifulSoup
import time
def _meta_content(soup, prop):
    """Return the ``content`` of the first tag whose ``property`` is *prop*, or ''."""
    tag = soup.find(attrs={"property": prop})
    if tag is None:
        return ''
    return tag.get('content', '')


def _hook_text(soup, hook):
    """Return the text of the first tag whose ``data-hook`` is *hook*, or ''."""
    tag = soup.find(attrs={"data-hook": hook})
    return tag.text if tag is not None else ''


# Scrape every post page and append one row per post to ``responses``.
for URL in blog_urls:
    # One-second pause before each request (presumably to avoid hammering
    # the site).
    time.sleep(1)
    page = requests.get(URL, timeout=30)
    if not page.ok:
        # Skip error pages rather than parsing them as if they were posts.
        print(f'Skipping {URL}: HTTP {page.status_code}')
        continue
    soup = BeautifulSoup(page.content, "html.parser")

    # The Open Graph meta tags must be read before _remove_attrs(), because
    # it deletes attributes such as ``property`` and ``content``.
    post_thumbnail_url = _meta_content(soup, "og:image")
    post_description_text = _meta_content(soup, "og:description")

    soup = _remove_attrs(soup)

    # Missing elements yield '' instead of aborting the whole export.
    post_title_text = _hook_text(soup, "post-title")
    post_author_text = _hook_text(soup, "user-name")
    post_date_text = _hook_text(soup, "time-ago")

    post_content = soup.find(attrs={"data-id": "rich-content-viewer"})
    post_content_html = str(post_content) if post_content is not None else ''

    post_categories = soup.find_all(attrs={"data-hook": "category-label-list__item"})
    post_categories = ','.join([category.text for category in post_categories])

    print(post_author_text)
    print(post_date_text)
    responses.append({
        'post_title': post_title_text,
        'post_author': post_author_text,
        'post_date': post_date_text,
        'post_content_html': post_content_html,
        'post_thumbnail': post_thumbnail_url,
        'post_description': post_description_text,
        'post_url': URL,
        'post_categories': post_categories,
    })
# Write the collected rows to data.csv, using the first row's keys as header.
if responses:
    print(responses[0])
    keys = list(responses[0].keys())
    # Explicit UTF-8 so post HTML with non-ASCII characters is written the
    # same way on every platform instead of depending on the locale default.
    with open('data.csv', 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(responses)
else:
    # Nothing was scraped; indexing responses[0] would raise IndexError.
    print('No blog posts were collected; data.csv was not written.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment