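"""Crawl DxOMark Mobile (smartphone camera benchmark) data.

NOTE(review): the reference link(s) that followed this summary appear to be
truncated in this copy of the file; only the trailing path segment survives:
"로-내게-맞는-스마트폰-카메라-찾기/". Restore the full URL(s) if known.
"""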
import re
import requests
import datetime as dt
from bs4 import BeautifulSoup
# Crawl DxOMark Mobile data
class DxOCrawler():
    """Crawl DxOMark Mobile review pages and collect per-model scores.

    Attributes:
        baseurl: prefix of the paginated listing URL; the page number is
            appended to it when fetching.
        links: review-article URLs harvested from the listing pages.
        data: mapping of model name -> list holding the photo values
            followed by the video values.
    """

    # Captures only the href value of each review anchor.  The capture group
    # stops at the closing quote, so extra attributes after ``href`` are not
    # swallowed and no fixed-offset slicing of the match is needed.
    _LINK_RE = re.compile(r'<a class="plain" href="([^"]+)"')

    # Explicit parser so results do not depend on which optional parsers
    # happen to be installed.
    _PARSER = "html.parser"

    def __init__(self, baseurl=""):
        """Create an empty crawler.

        Args:
            baseurl: listing URL prefix.  NOTE(review): the default is the
                empty string found in this copy of the file; the real
                listing URL has to be supplied for ``get`` to work.
        """
        self.baseurl = baseurl
        self.links = []
        # NOTE(review): the attribute name was lost in this copy of the
        # file; ``data`` is a reconstruction -- confirm against callers.
        self.data = dict()

    def get(self):
        """Fetch listing pages 1-11, then every linked review page.

        Each review that parses cleanly is stored in ``self.data`` and its
        model name and values are printed.  A review whose page lacks the
        expected elements is skipped and its link is printed instead.
        """
        for page_no in range(1, 12):
            page = requests.get(self.baseurl + str(page_no)).content
            soup = BeautifulSoup(page, self._PARSER)
            # Only the first <div> of the listing page is searched.
            self.links += self._extract_links(str(soup.div))

        for link in self.links:
            # The fetch stays outside the ``try`` so that request errors
            # are not mistaken for a page-layout problem.
            article = requests.get(link).content
            soup = BeautifulSoup(article, self._PARSER)
            try:
                model = self._model_name(soup)
                print(model)
                # Exactly two chart containers are expected: photo, video.
                photo_div, video_div = soup.find_all(
                    "div", attrs={"class": "anychart"})
                scores = self._parse_scores(
                    photo_div["data-array"], video_div["data-array"])
            except (IndexError, ValueError, KeyError):
                # Missing score div, wrong number of charts, missing
                # ``data-array`` or a non-numeric value: report and move on.
                print(link)
                continue
            self.data[model] = scores
            print(scores)

    @classmethod
    def _extract_links(cls, html):
        """Return the href of every ``<a class="plain" href="...">`` in html."""
        return cls._LINK_RE.findall(html)

    @staticmethod
    def _model_name(soup):
        """Return the model name of a review page.

        Reads the first ``div.scoreType.photo``; when that div only holds
        the bare label "photo" (or has no single string), the first six
        words of the page title are used instead.

        Raises:
            IndexError: the page has no such div.
        """
        model = soup.find_all(
            "div", attrs={"class": "scoreType photo"})[0].string
        # Compare on stripped text so the check does not depend on the
        # exact whitespace surrounding the label in the markup.
        if model is None or model.strip() == "photo":
            model = " ".join(soup.title.string.split()[:6])
        return model

    @staticmethod
    def _parse_scores(photo_raw, video_raw):
        """Turn the two comma-separated ``data-array`` strings into one list.

        All photo fields except the last are converted to int; the last
        photo field is kept as text.  All video fields are converted to
        int.

        Raises:
            ValueError: a field that must be numeric is not.
        """
        photo_fields = photo_raw.split(",")
        photo = [int(x) for x in photo_fields[:-1]] + [photo_fields[-1]]
        video = [int(x) for x in video_raw.split(",")]
        return photo + video

    @staticmethod
    def _format_row(model, scores):
        """Return one tab-separated output line for a model."""
        # ``scores`` mixes ints and text, so stringify before joining.
        return "{}\t{}\n".format(model, "\t".join(str(s) for s in scores))


def main():
    """Run the crawl and dump the results to a dated, tab-separated file."""
    crawler = DxOCrawler()
    # Populate ``crawler.data``; without this call the file would be empty.
    crawler.get()
    # NOTE(review): the format argument was lost in this copy of the file;
    # today's date is assumed because ``dt`` is imported and otherwise
    # unused -- confirm.
    out_name = "DxOMark_mobile_{}.txt".format(str(dt.date.today()))
    with open(out_name, "w") as w:
        for k, v in crawler.data.items():
            w.write(DxOCrawler._format_row(k, v))


if __name__ == "__main__":
    main()
