Last active
March 23, 2021 13:55
-
-
Save 16Yongjin/82d37938042269ce5d70b5271ca579b8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import date, datetime
from pprint import pprint
from random import random
from time import sleep

from instagram_private_api import Client, ClientCompatPatch
from pymongo import MongoClient
def safeget(dct, *keys):
    """Follow ``keys`` one by one into nested containers.

    Args:
        dct: The outermost container (typically a dict).
        *keys: Successive keys or indexes to look up.

    Returns:
        The value found after applying every key, or ``None`` as soon as
        one lookup fails. With no keys, ``dct`` itself is returned.
    """
    for key in keys:
        try:
            dct = dct[key]
        # KeyError: missing dict key; IndexError: list index out of range;
        # TypeError: the current value is not subscriptable (e.g. None).
        except (KeyError, IndexError, TypeError):
            return None
    return dct
def format_timestamp(timestamp):
    """Format a POSIX timestamp as ``YYYY-MM-DDTHH:MM:SS``.

    Args:
        timestamp: Seconds since the epoch.

    Returns:
        The formatted string, expressed in the machine's local time zone
        and without a UTC offset suffix.
    """
    # Use datetime (not date) so the time-of-day fields are real values,
    # and convert the argument rather than a fixed constant.
    moment = datetime.fromtimestamp(timestamp)
    return moment.strftime('%Y-%m-%dT%H:%M:%S')
def extract_item(item):
    """Reduce one media dict to the fields that get stored.

    Args:
        item: A media dict. ``taken_at``, ``pk``, ``like_count`` and
            ``comment_count`` are required; the other fields are optional.

    Returns:
        A flat dict with the post text, timestamps, author id, caption id,
        media id, like/comment counts, location and ``next_max_id``.

    Raises:
        KeyError: If one of the required keys is missing from ``item``.
    """
    user_id = safeget(item, 'user', 'pk')

    # Some posts have only a photo and no caption, with the hashtags placed
    # in a comment by the author instead: collect the author's own preview
    # comments that contain a hashtag.
    tagged_texts = []
    # `or []` also covers a key that is present but set to None.
    for comment in item.get('preview_comments') or []:
        text = safeget(comment, 'text')
        if not isinstance(text, str) or '#' not in text:
            continue
        if safeget(comment, 'user', 'pk') == user_id:
            tagged_texts.append(text)
    comments_with_tags = ''.join(tagged_texts)

    return {
        # Fall back to the tagged comments when there is no caption text.
        'text': safeget(item, 'caption', 'text') or comments_with_tags,
        'created_at': item['taken_at'],
        'created_at_iso': format_timestamp(item['taken_at']),
        'user_id': user_id,
        'caption_pk': safeget(item, 'caption', 'pk'),
        'id': item['pk'],
        'like_count': item['like_count'],
        'comment_count': item['comment_count'],
        'location': item.get('location', None),
        'next_max_id': item.get('next_max_id', None),
    }
def extract_items(items):
    """Return the extracted record for the ``'media'`` entry of each element.

    Args:
        items: An iterable of dicts, each holding a media dict under the
            ``'media'`` key.

    Returns:
        A list with one ``extract_item`` result per element, in input order.
    """
    return [extract_item(entry['media']) for entry in items]
user_name = ''  # Instagram login ID
password = ''  # Instagram password

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client.instagram
collection = db.tag_section

api = Client(user_name, password)

next_media_ids = []  # pagination IDs passed to the next request
for i in range(10000):
    # Search the tag section for recent posts
    results = api.tag_section('먹스타그램', tab='recent', next_media_ids=next_media_ids)

    sections = results.get('sections') or []
    if not sections:
        # Nothing came back; stop instead of failing on sections[0].
        print(i, 'no sections returned, stopping')
        break

    # Keep only the fields we need.
    # NOTE(review): only the first section is read; confirm whether the
    # remaining sections should be stored as well.
    items = extract_items(sections[0]['layout_content']['medias'])
    if not items:
        # insert_many() rejects an empty list, so stop cleanly here.
        print(i, 'no items returned, stopping')
        break

    # Collect the pagination IDs for the next request
    next_media_ids = [item['next_max_id'] for item in items if item['next_max_id'] is not None]

    collection.insert_many(items)  # save to the DB
    print(i, 'saved', len(items))
    sleep(random() * 10 + 5)  # pause 5-15 seconds between requests
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment