Skip to content

Instantly share code, notes, and snippets.

@wonsolution
Last active September 10, 2020 03:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wonsolution/d1eb34a492dcce4142f054b617ecb997 to your computer and use it in GitHub Desktop.
Save wonsolution/d1eb34a492dcce4142f054b617ecb997 to your computer and use it in GitHub Desktop.
crawling/scraping
from urllib.request import urlopen
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
import time
# Collect font page URLs from dafont.com's "new fonts" listing.
DAFONT_BASE_URL = 'https://www.dafont.com/'
DAFONT_OUTPUT_FILE = "dafont.txt"


def dafont_page_url(page):
    """Return the URL of page number *page* of the dafont "new fonts" listing."""
    return 'https://www.dafont.com/new.php?page={}&nup=3'.format(page)


def dafont_font_links(html):
    """Return the absolute font URLs found in the listing page source *html*.

    Every element matching the ``.preview`` selector contributes the ``href``
    of its first ``<a>`` tag, prefixed with the site root. Elements without a
    usable link are skipped instead of raising.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for preview in soup.select('.preview'):
        anchor = preview.a
        if anchor is None or not anchor.get('href'):
            continue
        links.append(DAFONT_BASE_URL + anchor['href'])
    return links


def crawl_dafont(start_page=1, page_load_delay=5, pause_between_pages=60):
    """Walk the listing page by page, appending every font URL to dafont.txt.

    Args:
        start_page: First listing page to fetch; pass e.g. 325 to resume a
            previous run from page 325.
        page_load_delay: Seconds to wait after opening a page before reading
            its source.
        pause_between_pages: Seconds to sleep before moving to the next page.

    The loop never terminates on its own; stop it manually.
    NOTE(review): consider stopping once a page yields no links.
    """
    page = start_page
    while True:
        url = dafont_page_url(page)
        page += 1

        driver = webdriver.Chrome()
        try:
            driver.get(url)
            time.sleep(page_load_delay)
            html = driver.page_source
        finally:
            # quit() rather than close(): a fresh browser is started for every
            # page, so each one must be fully shut down even on failure.
            driver.quit()

        # Mode "a" appends to the file instead of overwriting it ("w").
        with open(DAFONT_OUTPUT_FILE, "a", encoding="utf-8") as out:
            for link in dafont_font_links(html):
                line = link + '\n'
                print(line)
                out.write(line)

        time.sleep(pause_between_pages)


if __name__ == "__main__":
    crawl_dafont()
from urllib.request import urlopen
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import datetime
INSTAGRAM_BASE_URL = 'https://www.instagram.com/'


def instagram_post_url(href):
    """Return the absolute post URL for a link *href* taken from the page.

    A leading slash on *href* is dropped so the result never contains ``//``
    after the host name.
    """
    return INSTAGRAM_BASE_URL + href.lstrip('/')


def instagram_image_path(stamp, user, index, directory='./haha/'):
    """Return the file path used to store image number *index* of one run.

    The name is ``<stamp>_<user><index>.jpg`` inside *directory*.
    """
    return directory + stamp + '_' + user + str(index) + '.jpg'


def crawl_instagram(user='quanhaha79', directory='./haha/',
                    page_load_delay=1, pause_between_runs=60):
    """Repeatedly download the thumbnails shown on one Instagram profile page.

    The profile address is fixed by *user* rather than asked for with
    ``input()``, because prompting does not fit an endless loop. To crawl a
    hashtag instead of a profile, the page address would be
    ``https://www.instagram.com/explore/tags/<keyword>/``.

    Args:
        user: Profile ID to crawl; also used in the saved file names.
        directory: Folder that receives the images (created if missing).
        page_load_delay: Seconds to wait after opening the page before
            reading its source.
        pause_between_runs: Seconds to sleep between two passes.

    The loop never terminates on its own; stop it manually.
    """
    import os  # local import keeps this block self-contained

    # Without this the first open() fails when the folder does not exist yet.
    os.makedirs(directory, exist_ok=True)
    url = INSTAGRAM_BASE_URL + user

    while True:
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            time.sleep(page_load_delay)
            html = driver.page_source
        finally:
            # quit() rather than close(): a fresh browser is started on every
            # pass, so each one must be fully shut down even on failure.
            driver.quit()

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')
        # NOTE(review): these class names are copied as-is; presumably they
        # follow the site's markup at the time of writing - confirm.
        posts = soup.select('.v1Nh3.kIKUG._bz0w')
        stamp = datetime.datetime.now().strftime("%y%m%d_%H%M%S")

        for index, post in enumerate(posts, start=1):
            anchor = post.a
            if anchor is not None and anchor.get('href'):
                print(instagram_post_url(anchor['href']))

            thumb = post.select_one('.KL4Bh')
            image = thumb.img if thumb is not None else None
            if image is None or not image.get('src'):
                # No thumbnail to download for this post; skip it.
                continue

            img_url = image['src']
            target = instagram_image_path(stamp, user, index, directory)
            # The download is opened first, so a failed request leaves no
            # empty file behind.
            with urlopen(img_url) as response, open(target, 'wb') as out:
                out.write(response.read())
            print(img_url)
            print()

        print('완료')
        time.sleep(pause_between_runs)


if __name__ == "__main__":
    crawl_instagram()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment