Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
食べログEDA

食べログEDA

3.6点の壁は本当にあるのか

スクレイピングのコード

import os
import sys
import bs4
import requests
import time
import json
from hashlib import sha256
from pathlib import Path
import random
# Entry listing page; it is scanned once before the random sampling loop starts.
u = 'https://tabelog.com/tokyo/A1301/'

def scan(u, page, chi, wait=6.0, timeout=30.0):
    """Fetch one listing page and cache the restaurants found on it as JSON.

    The records are written to ``comps/<hash>``, where ``<hash>`` is the
    first 16 hex digits of the SHA-256 of ``u``.  If that file already
    exists the URL is considered done and nothing is fetched.

    Args:
        u: URL of the listing page to fetch.
        page: Page number stored verbatim in every record.
        chi: Area index stored verbatim in every record.
        wait: Seconds to sleep after a network request (rate limiting).
        timeout: Seconds after which the HTTP request is abandoned.

    Returns:
        None.  The only outputs are the cache file and progress messages
        printed to stdout.
    """
    cache_dir = Path('comps')
    hs = sha256(bytes(u, 'utf8')).hexdigest()[:16]
    cache_path = cache_dir / hs
    print('try', u)
    if cache_path.exists():
        return

    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', 'referer': u}
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.get(u, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Best effort: report, back off, and leave the URL uncached so that
        # a later call can retry it.
        print('failed', u, exc)
        time.sleep(wait)
        return

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(r.text, 'html.parser')

    objs = []
    for lst in soup.find_all('li', {'class': 'list-rst'}):
        print(lst.text.replace('\n', ''))
        t = lst.find('a', {'class': 'cpy-rst-name'})
        s = lst.find('span', {'class': 'list-rst__rating-val'})
        rn = lst.find('em', {'class': 'list-rst__rvw-count-num cpy-review-count'})
        # An entry may come without a score (or without a name link);
        # such entries are skipped instead of raising AttributeError.
        if t is None or s is None:
            continue
        objs.append({
            'mise': t.text,
            'score': s.text,
            'page': page,
            'chi': chi,
            # The review counter can be absent even when a score is present.
            'review_num': rn.text if rn is not None else None,
        })

    # Create the cache directory on demand so a fresh checkout does not
    # crash after the page has already been downloaded.
    cache_dir.mkdir(parents=True, exist_ok=True)
    with open(cache_path, 'w') as fp:
        json.dump(objs, fp, ensure_ascii=False, indent=2)
    print('complete', u)
    time.sleep(wait)
# Fetch the entry page first, recorded as page 0 of area -1.
scan(u, 0, -1)

# Then sample (page, area) pairs at random, forever.  scan() returns early
# for URLs that already have a cache file, so repeats cost no request.
while True:
    page = random.sample(range(1, 100), 1)[0]
    chi = random.sample(range(31), 1)[0]
    print(chi)
    target = f'https://tabelog.com/tokyo/A13{chi:02d}/rstLst/{page}/'
    scan(target, page, chi)

スクレイピングしたデータ

28783件

集計のコード

from collections import Counter
import glob
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def force_to_int(x):
    """Convert *x* to ``int``, returning ``-1`` when it is not convertible.

    Args:
        x: Any value; typically the scraped review-count text.

    Returns:
        ``int(x)`` on success, otherwise ``-1`` as a "no valid count" marker.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to -1; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return -1


# Gather every record from the JSON files under ./comps/.
objs = []
for fn in glob.glob('./comps/*'):
    try:
        # The context manager closes the file even when parsing fails.
        with open(fn) as fp:
            arr = json.load(fp)
    except (OSError, ValueError) as exc:
        # Unreadable or malformed files (json.JSONDecodeError and
        # UnicodeDecodeError are ValueErrors) are reported and skipped.
        print(fn, exc)
        continue
    for obj in arr:
        print(obj)
        objs.append(obj)

if not objs:
    # Fail with a clear message instead of a KeyError from drop_duplicates.
    raise SystemExit('no records found under ./comps/')

df = pd.DataFrame(objs)
# Keep one row per shop name; the last occurrence wins.
df = df.drop_duplicates(subset=['mise'], keep='last')
df['score'] = df['score'].astype(float)
# Review counts that are not integers become -1 (see force_to_int).
df['review_num'] = df['review_num'].apply(force_to_int)
df.to_csv('out.csv', index=False)

# Apply the seaborn theme once, before any figure is drawn, so that every
# image (including the first one) is rendered with the same style.
sns.set(font_scale=1.5)
Path('imgs').mkdir(parents=True, exist_ok=True)

# One bar chart of score frequencies per review-count threshold.
for th in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]:
    # Rows whose review count could not be parsed carry -1 and are
    # therefore excluded even at th == 0.
    df1 = df[df.review_num >= th]
    if df1.empty:
        # Nothing to plot for this threshold.
        continue

    # Frequency of each distinct score, ordered by score.
    df2 = (
        df1['score']
        .value_counts()
        .sort_index()
        .rename_axis('score')
        .reset_index(name='freq')
    )

    fig, ax = plt.subplots(figsize=(30, 10))
    sns.barplot(x='score', y='freq', data=df2, ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set(xlabel='score', ylabel='freq', title=f'Hist over {th} in review_num, sample size {df1.shape[0]}')
    fig.savefig(f'imgs/img_{th}.png')
    # Release the figure; otherwise all ten large figures stay in memory.
    plt.close(fig)

結果

レビュー件数の足切りを何件に設定するかによって、結果は大きく変わりそうである。

レビュー足切りなし

レビュー足切り30件

レビュー足切り90件

3.6および3.8を境に、それより高い点数の店舗数（頻度）が急激に減少しており、何らかの意図的な操作が加わっていることが推察される。

@GINK03

This comment has been minimized.

Copy link
Owner Author

GINK03 commented Oct 11, 2019

image

image

image

image

@OsciiArt

This comment has been minimized.

Copy link

OsciiArt commented Oct 12, 2019

離散値データのhistogramでビニングをちゃんと指定してないのはまずいと思いました。
こんな感じでどうでしょう↓

"""
Bins are half-open: bins[i] <= x < bins[i+1].
To check whether non-paying shops are capped at 3.6,
we want a bin that covers 3.55 < x <= 3.60.
So the edges are spaced 0.05 apart and shifted up by 0.000001.

"""
# Histogram of scores with explicit bin edges.
# Edges sit on multiples of bin_width, shifted up by a tiny epsilon, so a
# half-open bin [edge, next_edge) effectively covers (k*w, (k+1)*w]; a score
# of exactly 3.60 then lands in the (3.55, 3.60] bin.
bin_width = 0.05
eps = 0.000001
score_min = df['score'].min()
score_max = df['score'].max()

# Grid steps enclosing the data.  The quotient is rounded first to absorb
# floating-point noise in the division.  The first edge is placed strictly
# below the minimum score so that the lowest-scored rows are not dropped.
first_step = int(np.ceil(np.round(score_min / bin_width, 6))) - 1
last_step = int(np.ceil(np.round(score_max / bin_width, 6)))
bins = np.arange(first_step, last_step + 1) * bin_width + eps
print(bins)

# Tick marks on multiples of tick_width that span the data range.
tick_width = 0.2
first_tick = int(np.floor(np.round(score_min / tick_width, 6)))
last_tick = int(np.ceil(np.round(score_max / tick_width, 6)))
xticks = np.arange(first_tick, last_tick + 1) * tick_width

for th in [0, 10, 20, 30, 40, 50]:
    df1 = df[df.review_num >= th]
    fig, ax = plt.subplots()
    df1[['score']].hist(bins=bins, ax=ax)
    ax.set_title(f'Hist over {th} in review_num. size: {df1.shape[0]}')
    ax.set_xticks(xticks)

image
image
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.