Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Crawl sticker images from the LINE Store and download them, using beautifulsoup4.
#!/usr/bin/env python3
# Web Crawler for line stickers
# USAGE:
# $ python line_sticker_printer.py <url>
# example url:
# https://store.line.me/stickershop/home/user/zh-Hant
# https://store.line.me/stickershop/home/general/zh-Hant
#
# credit:
# jminh@github and hour of code
#
# Reference:
# main: http://pycontw.blogspot.tw/2015/12/hour-of-code-90.html
# code: https://github.com/jminh/hour_of_code_python_2015
# hackpad notes: https://hocpython.hackpad.com/Hour-of-Code-Python--oQL8j5m00dp
#
# Revised by D. Liu
from bs4 import BeautifulSoup as BS
import os
import requests
import sys
def download_file(url, dir='.', timeout=30):
    """Download ``url`` into the directory ``dir`` and return the file name.

    The local file name is NOT the last path component of the URL (in the
    example URLs below it is always ``thumbnail_shop.png``); it is the third
    component from the end, which differs per sticker set:

        http://.../products/0/0/1/1239040/LINEStorePC/thumbnail_shop.png
        http://.../products/0/0/1/1235900/LINEStorePC/thumbnail_shop.png

    are saved as ``1239040`` and ``1235900`` respectively.

    Args:
        url: Address of the file to fetch.
        dir: Existing directory the file is written into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The bare file name (without ``dir``) the content was saved under.

    Raises:
        IndexError: If ``url`` has fewer than three '/'-separated parts.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    local_filename = url.split('/')[-3]
    # stream=True fetches the body lazily so it is written chunk by chunk
    # instead of being loaded into memory at once. The ``with`` block makes
    # sure the connection is released even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before creating the file so an error page is never saved
        # to disk as if it were an image.
        r.raise_for_status()
        with open(os.path.join(dir, local_filename), 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    # No per-chunk flush: closing the file flushes it.
                    f.write(chunk)
    return local_filename
def main():
    """Download every sticker thumbnail found on the page given in argv[1].

    Fetches the page at ``sys.argv[1]``, collects the ``src`` of the image
    inside each ``<div class="mdCMN05Img">`` (see ``response_snippet`` in
    this file for a sample of that markup), creates a folder named after
    the part of the page ``<title>`` before the first '-', and saves every
    image into it with ``download_file``.

    Raises:
        requests.HTTPError: If the page request returns a 4xx/5xx status.
    """
    # make request and get response
    res = requests.get(sys.argv[1], timeout=30)
    res.raise_for_status()

    # parse text content of response
    soup = BS(res.text, 'html.parser')

    # A string as the second argument of find_all matches on the CSS class.
    divs = soup.find_all('div', 'mdCMN05Img')

    # Get the image link from the src attribute, skipping divs that have no
    # <img> child or whose <img> has a missing/empty src instead of crashing.
    stickers = []
    for div in divs:
        img = div.img
        src = img.get('src') if img is not None else None
        if src:
            stickers.append(src)

    # Name the download folder after the text of <title> up to the first '-'.
    page_title = soup.title.text if soup.title is not None else ''
    title = page_title.split('-')[0].strip()
    # Path separators in a title would point into another directory.
    title = title.replace('/', '_').replace('\\', '_')
    if not title:
        # No usable <title>: fall back to a fixed folder name.
        title = 'line_stickers'

    download_dir = os.path.join(os.getcwd(), title)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(download_dir, exist_ok=True)

    # download images
    for url in stickers:
        print(download_file(url, download_dir), "downloaded!")
if __name__ == '__main__':
    # Script entry point: exactly one positional argument (the store page
    # URL) is required. sys.exit() with a string prints it to stderr and
    # exits with status 1, so a missing argument is reported as a failure
    # rather than as a successful run.
    if len(sys.argv) < 2:
        sys.exit("Usage: line_sticker_printer.py [URL]")
    main()
# Sample of the store-page markup that main() searches: each sticker is an
# <img> inside a <div class="mdCMN05Img">. Kept as reference material for
# the "check response_snippet" note in main(); no code visible in this file
# reads this value.
response_snippet = """
<div class="MdCMN02List">
<ul class="mdCMN02Ul">
<li class="mdCMN02Li">
<a href="/stickershop/product/1236945/zh-Hant" data-gcl="sticker|click|sticker_item_1236945">
<div class="MdCMN05Item mdCMN05Sticker">
<div class="mdCMN05Img">
<img src="https://sdl-stickershop.line.naver.jp/products/0/0/1/1236945/LINEStorePC/thumbnail_shop.png" height="120" width="120">
</div>
<p class="mdCMN05Ttl">title of the stickers</p>
</div></a>
</li>
<li class="mdCMN02Li">
<a href="/stickershop/product/1238025/zh-Hant" data-gcl="sticker|click|sticker_item_1238025">
<div class="MdCMN05Item mdCMN05Sticker">
<div class="mdCMN05Img">
<img src="https://sdl-stickershop.line.naver.jp/products/0/0/1/1238025/LINEStorePC/thumbnail_shop.png" height="120" width="120">
</div>
<p class="mdCMN05Ttl">title of the stickers</p>
</div></a>
</li>
<!--..More li tags here...-->
</ul>
</div>
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.