Last active
July 23, 2023 16:40
-
-
Save dev-jonghoonpark/7a7532cefac46fb0f50dfb18d28a7b22 to your computer and use it in GitHub Desktop.
dcinside crawling with playwright python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import time | |
import json | |
import pathlib | |
import nest_asyncio | |
nest_asyncio.apply() | |
import asyncio | |
from playwright.async_api import async_playwright, expect | |
from telegram import Bot | |
async def set_extra_http_headers(page):
    """Attach a fixed set of browser-like request headers to *page*.

    Args:
        page: A Playwright page (anything exposing an awaitable
            ``set_extra_http_headers(dict)`` method).

    Returns:
        The same ``page`` object, so the call can be chained with
        ``page = await set_extra_http_headers(page)``.
    """
    # NOTE: "YOUR_USER_AGENT" is a placeholder left by the author; replace it
    # with a real User-Agent string before running.
    await page.set_extra_http_headers({
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "sec-ch-ua-mobile": "?0",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "YOUR_USER_AGENT",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "ko-KR,ko;q=0.9",
        "Referer": "https://gall.dcinside.com/",
    })
    return page
# Seconds to wait after navigation before reading the page.
_PAGE_SETTLE_SECONDS = 10
# Seconds to pause after each Telegram upload.
_SEND_PAUSE_SECONDS = 5


def _pointer_path():
    """Return the path of ``latest_id_pointer.json`` next to this script."""
    return pathlib.Path(__file__).parent.resolve() / "latest_id_pointer.json"


def _load_latest_id(default=99999):
    """Read the last processed post number, or *default* if no file exists."""
    try:
        with open(_pointer_path()) as json_file:
            return json.load(json_file)["latest_id_pointer"]
    except FileNotFoundError:
        # First run: fall back to the value the script initialises the
        # pointer with, instead of crashing before anything is scraped.
        return default


def _save_latest_id(num):
    """Persist *num* as the last processed post number."""
    with open(_pointer_path(), "w") as json_file:
        json.dump({"latest_id_pointer": num}, json_file)


async def _collect_new_articles(page, latest_id_pointer):
    """Return list rows on *page* whose number exceeds *latest_id_pointer*.

    Each entry is a dict with ``num`` (int), ``title`` (str) and ``url``
    (the raw ``href`` of the title link), in page order.
    """
    new_articles = []
    for row in await page.locator("table.gall_list tbody tr.us-post").all():
        num_text = (await row.locator("td.gall_num").text_content() or "").strip()
        # Skip rows whose number cell is not a plain integer rather than
        # letting int() abort the whole run.
        if not num_text.isdecimal():
            continue
        num = int(num_text)
        if num <= latest_id_pointer:
            continue
        link = row.locator("td.gall_tit a:first-child")
        url = await link.get_attribute("href")
        title = (await link.text_content() or "").strip()
        new_articles.append({"num": num, "title": title, "url": url})
    return new_articles


async def _send_article(browser, bot, chat_id, article):
    """Open *article*, screenshot ``#container`` and send it to *chat_id*."""
    page = await browser.new_page()
    try:
        page = await set_extra_http_headers(page)
        # Assumes the scraped href is a site-relative path -- TODO confirm.
        article_url = "https://gall.dcinside.com{0}".format(article["url"])
        await page.goto(article_url, wait_until="domcontentloaded")
        # Non-blocking wait: time.sleep() would stall the event loop that
        # Playwright itself runs on.
        await asyncio.sleep(_PAGE_SETTLE_SECONDS)
        message = "[{0}] {1}\n{2}".format(article["num"], article["title"], article_url)
        # hide unnecessary content
        await page.evaluate('() => document.querySelectorAll("iframe").forEach(iframe => iframe.setAttribute("style", "display: none"))')
        await page.evaluate('() => document.querySelector("#container > section > article:last-child").setAttribute("style", "display: none")')
        buffer = await page.locator("#container").screenshot()
        # send screenshot to you
        await bot.send_document(
            chat_id=chat_id,
            document=buffer,
            caption=message,
            filename="screenshot_{0}.png".format(article["num"]),
        )
        await asyncio.sleep(_SEND_PAUSE_SECONDS)
    finally:
        # Close the tab even when navigation, scripting or the upload fails.
        await page.close()


async def main():
    """Forward new gallery posts to a Telegram chat as screenshots.

    Loads the gallery list page in a persistent Chrome context and collects
    the rows whose number is greater than the value stored in
    ``latest_id_pointer.json``. Each new post is then visited in reverse page
    order, its ``#container`` element is screenshotted, and the image is sent
    through the Telegram bot. The pointer file is rewritten after every post
    that was sent. An exception raised while processing posts is reported to
    the chat as a text message and stops the loop; it does not propagate.

    ``YOUR_BOT_TOKEN`` and ``YOUR_CHAT_ID`` are placeholders that must be
    filled in before running.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch_persistent_context(
            executable_path="/usr/bin/google-chrome-stable",
            user_data_dir="/content/random-user",
        )
        try:
            page = await browser.new_page()
            page = await set_extra_http_headers(page)
            # example: plant gallery
            await page.goto(url="https://gall.dcinside.com/board/lists/?id=tree", wait_until="domcontentloaded")
            await asyncio.sleep(_PAGE_SETTLE_SECONDS)
            new_articles = await _collect_new_articles(page, _load_latest_id())
            await page.close()
            if not new_articles:
                return
            bot = Bot(token='YOUR_BOT_TOKEN')
            try:
                for article in reversed(new_articles):
                    await _send_article(browser, bot, YOUR_CHAT_ID, article)
                    # Record progress per post so a later failure does not
                    # cause already-sent posts to be sent again.
                    _save_latest_id(article["num"])
            except Exception as e:
                # Top-level boundary: surface the failure in the chat.
                await bot.send_message(chat_id=YOUR_CHAT_ID, text=str(e))
        finally:
            # Always release the persistent context (and its browser process).
            await browser.close()
# Script entry point: run the crawler once on the asyncio event loop.
asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment