Skip to content

Instantly share code, notes, and snippets.

@naturale0
Last active November 18, 2020 02:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save naturale0/46054738f4f61916d6399058c58fafdc to your computer and use it in GitHub Desktop.
Save naturale0/46054738f4f61916d6399058c58fafdc to your computer and use it in GitHub Desktop.
Crawl DxOMark Mobile (smartphone camera benchmark) data: https://de-novo.org/2017/12/29/efa로-내게-맞는-스마트폰-카메라-찾기/
import re
import requests
import datetime as dt
from bs4 import BeautifulSoup
# Crawl DxOMark Mobile data
class DxOCrawler():
    """Crawl DxOMark mobile review pages and collect per-device sub-scores.

    The code is written to run unchanged on both Python 2 and Python 3.

    Attributes:
        baseurl: URL prefix of the paginated review listing; the page number
            is appended directly to it.
        links: Review article URLs harvested from the listing pages.
        data: Maps a model name to its photo sub-scores followed by its
            video sub-scores.
    """

    # Captures the href value of every review anchor in the listing markup,
    # so no positional slicing of the matched text is needed afterwards.
    _LINK_RE = re.compile(r'<a class="plain" href="([^"]+)"')

    def __init__(self):
        self.baseurl = "https://www.dxomark.com/category/mobile-reviews/page"
        self.links = []
        self.data = dict()

    @staticmethod
    def _parse_photo_scores(raw):
        """Turn a comma-separated ``data-array`` string into a score list.

        Every field is converted to ``int``. When that fails, the last field
        is kept as a string and only the preceding fields are converted.

        Raises:
            ValueError: if a field other than the last one is not an integer.
        """
        fields = raw.split(",")
        try:
            return [int(field) for field in fields]
        except ValueError:
            return [int(field) for field in fields[:-1]] + [fields[-1]]

    def get(self, num_pages=11):
        """Collect review links, then scrape the scores of every review.

        Listing pages ``1..num_pages`` are fetched and the article URLs found
        in them are appended to ``self.links``. Each article is then fetched;
        its model name and scores are printed and stored in ``self.data``.
        Articles that cannot be parsed are skipped and their URL is printed.

        Args:
            num_pages: Number of listing pages to fetch (default 11).
        """
        for page_no in range(1, num_pages + 1):
            # NOTE(review): this requests ".../page1", not ".../page/1" --
            # confirm against the site's pagination URLs.
            page = requests.get(self.baseurl + str(page_no)).content
            # An explicit parser keeps the result independent of which
            # optional parsers happen to be installed.
            soup = BeautifulSoup(page, "html.parser")
            # Only the first <div> of the page is searched for anchors.
            self.links += self._LINK_RE.findall(str(soup.div))

        for link in self.links:
            article = requests.get(link).content
            soup = BeautifulSoup(article, "html.parser")
            try:
                model = soup.find_all(
                    "div", attrs={"class": "scoreType photo"})[0].string
                if model == "\n photo ":
                    # Fall back to the first six words of the page title.
                    model = " ".join(soup.title.string.split()[:6])
                # Exactly two chart containers are expected: photo, video.
                photo, video = soup.find_all(
                    "div", attrs={"class": "anychart"})
                scores = self._parse_photo_scores(photo["data-array"])
                scores += [int(field)
                           for field in video["data-array"].split(",")]
            except (IndexError, KeyError, ValueError):
                # Page does not have the expected score markup: report it
                # and carry on with the remaining articles.
                print(link)
                continue
            print(model)
            self.data[model] = scores
            print(scores)
if __name__ == "__main__":
    # Run a full crawl and dump the results as a tab-separated text file
    # named after today's date, one device per line: model, then scores.
    crawler = DxOCrawler()
    crawler.get()
    out_name = "DxOMark_mobile_{}.txt".format(str(dt.datetime.today().date()))
    with open(out_name, "w") as w:
        for k, v in crawler.data.items():
            # Scores are mostly ints; str.join only accepts strings.
            v_tabbed = "\t".join(str(score) for score in v)
            w.write("{}\t{}\n".format(k, v_tabbed))
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment