Skip to content

Instantly share code, notes, and snippets.

@dev-jonghoonpark
Last active July 23, 2023 16:40
Show Gist options
  • Save dev-jonghoonpark/7a7532cefac46fb0f50dfb18d28a7b22 to your computer and use it in GitHub Desktop.
Save dev-jonghoonpark/7a7532cefac46fb0f50dfb18d28a7b22 to your computer and use it in GitHub Desktop.
DCInside crawling with Playwright (Python)
import re
import time
import json
import pathlib
import nest_asyncio
nest_asyncio.apply()
import asyncio
from playwright.async_api import async_playwright, expect
from telegram import Bot
async def set_extra_http_headers(page):
    """Install a fixed set of extra HTTP request headers on *page*.

    The headers are passed to ``page.set_extra_http_headers``; the same
    ``page`` object is returned so the call can be chained with assignment.

    NOTE(review): "YOUR_USER_AGENT" looks like a placeholder that must be
    replaced with a real User-Agent string before use -- confirm.
    """
    browser_like_headers = {
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "sec-ch-ua-mobile": "?0",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "YOUR_USER_AGENT",
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/apng,*/*;q=0.8,"
            "application/signed-exchange;v=b3;q=0.9"
        ),
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "ko-KR,ko;q=0.9",
        "Referer": "https://gall.dcinside.com/",
    }
    await page.set_extra_http_headers(browser_like_headers)
    return page
def _read_latest_id_pointer(pointer_path, default=99999):
    """Return the stored ``latest_id_pointer``, or *default* if the file is missing."""
    try:
        with open(pointer_path) as json_file:
            return json.load(json_file)["latest_id_pointer"]
    except FileNotFoundError:
        # First run: no pointer has been saved yet, so fall back to the default.
        return default


async def _collect_new_articles(browser, latest_id_pointer):
    """Open the gallery list page and return rows numbered above *latest_id_pointer*.

    Each returned item is a dict with the keys ``num`` (int), ``title`` (str)
    and ``url`` (the ``href`` attribute of the title link). Items are in the
    order the rows appear on the page.
    """
    page = await browser.new_page()
    try:
        page = await set_extra_http_headers(page)
        # example: plant gallery
        await page.goto(
            url="https://gall.dcinside.com/board/lists/?id=tree",
            wait_until="domcontentloaded",
        )
        # asyncio.sleep (not time.sleep) so the event loop driving Playwright
        # keeps running while we wait for the page to settle.
        await asyncio.sleep(10)

        new_articles = []
        for tr in await page.locator("table.gall_list tbody tr.us-post").all():
            num = int(await tr.locator("td.gall_num").text_content())
            if num <= latest_id_pointer:
                continue
            a = tr.locator("td.gall_tit a:first-child")
            url = await a.get_attribute("href")
            title = (await a.text_content()).strip()
            new_articles.append({"num": num, "title": title, "url": url})
        return new_articles
    finally:
        # Close the page even if navigation or parsing failed.
        await page.close()


async def _send_article_screenshot(browser, bot, article):
    """Screenshot one article page and send it to the Telegram chat as a document."""
    page = await browser.new_page()
    try:
        page = await set_extra_http_headers(page)
        article_url = f"https://gall.dcinside.com{article['url']}"
        await page.goto(article_url, wait_until="domcontentloaded")
        await asyncio.sleep(10)
        message = f"[{article['num']}] {article['title']}\n{article_url}"
        # hide unnecessary content
        await page.evaluate('() => document.querySelectorAll("iframe").forEach(iframe => iframe.setAttribute("style", "display: none"))')
        await page.evaluate('() => document.querySelector("#container > section > article:last-child").setAttribute("style", "display: none")')
        buffer = await page.locator("#container").screenshot()
        # send screenshot to you
        # NOTE(review): YOUR_CHAT_ID is not defined in this file; it appears
        # to be a placeholder to replace with a real chat id -- confirm.
        await bot.send_document(
            chat_id=YOUR_CHAT_ID,
            document=buffer,
            caption=message,
            filename=f"screenshot_{article['num']}.png",
        )
        await asyncio.sleep(5)
    finally:
        await page.close()


async def main():
    """Find gallery articles newer than the saved pointer and send each to Telegram.

    Steps:
      1. Launch a persistent Chromium context.
      2. Read ``latest_id_pointer.json`` (next to this script); a missing file
         falls back to 99999.
      3. Collect list rows whose number is greater than the pointer.
      4. For each new article (iterating the collected list in reverse), send
         a screenshot via the Telegram bot and then persist that article's
         number as the new pointer.

    Any exception raised while sending is reported to the Telegram chat as a
    text message instead of propagating. The browser context is always closed.
    """
    pointer_path = pathlib.Path(__file__).parent.resolve() / "latest_id_pointer.json"

    async with async_playwright() as p:
        browser = await p.chromium.launch_persistent_context(
            executable_path="/usr/bin/google-chrome-stable",
            user_data_dir="/content/random-user",
        )
        try:
            latest_id_pointer = _read_latest_id_pointer(pointer_path)
            new_articles = await _collect_new_articles(browser, latest_id_pointer)
            if not new_articles:
                return

            bot = Bot(token='YOUR_BOT_TOKEN')
            try:
                # presumably the list page shows newest first, so reversing
                # sends the oldest new article first -- verify against the site
                for article in reversed(new_articles):
                    await _send_article_screenshot(browser, bot, article)
                    # Persist progress after every article so a later failure
                    # does not cause already-sent articles to be re-sent.
                    with open(pointer_path, "w") as json_file:
                        json.dump({"latest_id_pointer": article["num"]}, json_file)
            except Exception as e:
                await bot.send_message(chat_id=YOUR_CHAT_ID, text=str(e))
        finally:
            # Close the persistent context on every exit path (early return,
            # success, or error).
            await browser.close()
# Script entry point: run the main() coroutine to completion.
asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment