Skip to content

Instantly share code, notes, and snippets.

@Lee-W
Created November 15, 2016 03:01
Show Gist options
  • Save Lee-W/f2b9672a6d906348a20dfa75934c2998 to your computer and use it in GitHub Desktop.
Save Lee-W/f2b9672a6d906348a20dfa75934c2998 to your computer and use it in GitHub Desktop.
import argparse
from urllib.parse import urljoin
import requests
import json
from bs4 import BeautifulSoup
# Root of the facemood site; relative links scraped from pages are resolved against it.
BASE_URL = 'http://facemood.grtimed.com/'
# URL that main() queries with the `view` and `cateid` parameters to fetch a category.
INDEX_URL = urljoin(BASE_URL, 'index.php')
# Number of categories to scrape: category ids 1 through CATE_RANGE are requested.
CATE_RANGE = 15
def parse_facemood_item(soup):
    """Extract one facemood and its tags from a ``facemoodItem`` element.

    The facemood text is read from the ``data-f-text`` attribute of the
    first ``div.facemoodItemText`` descendant; the tags are the stripped
    texts of every ``div.faceMatchTagItem`` descendant.

    Args:
        soup: Parsed element exposing ``find`` and ``find_all``.

    Returns:
        A single-entry dict ``{facemood_text: set_of_tags}``, or an empty
        dict when the element carries no facemood text, so that one
        malformed item does not abort the whole scrape.
    """
    text_soup = soup.find('div', class_='facemoodItemText')
    if text_soup is None:
        return {}

    facemood_text = text_soup.get('data-f-text')
    if facemood_text is None:
        return {}

    # A set collapses tags that are listed more than once for the same item.
    facemood_tag = {
        tag_soup.text.strip()
        for tag_soup in soup.find_all('div', class_='faceMatchTagItem')
    }
    return {facemood_text: facemood_tag}
def parse_facemood_page(soup):
    """Gather every facemood found on a single listing page.

    Each ``div.facemoodItem`` element is passed to ``parse_facemood_item``
    and the resulting entries are merged into one dict; when the same key
    occurs more than once, the later entry replaces the earlier one.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        A dict combining the entries returned for every item on the page.
    """
    collected = {}
    for item_soup in soup.find_all('div', class_='facemoodItem'):
        for text, tags in parse_facemood_item(item_soup).items():
            collected[text] = tags
    return collected
def parse_facemood_category_page(soup):
    """Collect the facemoods of every page linked from a category page.

    The links inside ``div.facemoodPageJumpMenu`` are resolved against
    ``BASE_URL``; each distinct page is downloaded and parsed with
    ``parse_facemood_page``.

    Args:
        soup: Parsed category page exposing ``find``.

    Returns:
        A dict merging the facemoods of all linked pages; entries from later
        pages replace earlier ones that share the same key.

    Raises:
        requests.RequestException: If a page request fails, times out or
            answers with an HTTP error status.
    """
    menu_soup = soup.find('div', class_='facemoodPageJumpMenu')
    if menu_soup is None:
        # NOTE(review): presumably a category without a jump menu has a single
        # page -- confirm against the site. Parse the page we already have
        # instead of failing with AttributeError on None.
        print('\tNo page menu found, parsing the category page itself')
        return parse_facemood_page(soup)

    # href=True skips anchors without a link target; dict.fromkeys removes
    # repeated links while keeping first-seen order, so no page is fetched twice.
    page_urls = dict.fromkeys(
        urljoin(BASE_URL, a_soup['href'])
        for a_soup in menu_soup.find_all('a', href=True)
    )

    facemoods = dict()
    for url in page_urls:
        print('\tParsing {}'.format(url))
        # Without a timeout a stalled connection would block the scrape forever.
        req = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        req.raise_for_status()
        page_soup = BeautifulSoup(req.text, 'html5lib')
        facemoods.update(parse_facemood_page(page_soup))
    return facemoods
def main(output_file_name):
    """Scrape every facemood category and dump the result as JSON.

    Category ids ``1`` through ``CATE_RANGE`` are requested from
    ``INDEX_URL``; each response is parsed and handed to
    ``parse_facemood_category_page``. The merged mapping is written to
    *output_file_name* as a JSON object whose values are sorted tag lists.

    Args:
        output_file_name: Path of the JSON file to create or overwrite.

    Raises:
        requests.RequestException: If a category request fails, times out or
            answers with an HTTP error status.
    """
    all_facemoods = dict()
    for cate_id in range(1, CATE_RANGE + 1):
        payload = {
            'view': 'facemood',
            'cateid': str(cate_id),
        }
        print('Parsing category {}'.format(cate_id))
        # Without a timeout a stalled connection would block the scrape forever.
        req = requests.get(INDEX_URL, params=payload, timeout=30)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        req.raise_for_status()
        soup = BeautifulSoup(req.text, 'html5lib')
        all_facemoods.update(parse_facemood_category_page(soup))

    # Sets are not JSON serializable and iterate in arbitrary order; sorting
    # makes the output file reproducible between runs.
    all_facemoods_json = {k: sorted(v) for k, v in all_facemoods.items()}

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must not depend on the platform's locale default.
    with open(output_file_name, 'w', encoding='utf-8') as output_file:
        json.dump(all_facemoods_json, output_file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Command-line entry point: the only option is where to write the JSON.
    cli = argparse.ArgumentParser()
    cli.add_argument('-o', '--output-path', default='facemood.json')
    main(cli.parse_args().output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment