Last active
July 2, 2018 17:17
-
-
Save suminb/53681b007ba28b7d3a760a437cd92a60 to your computer and use it in GitHub Desktop.
Demo for a lecture at WISET (2018-05-11)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""웹브라우저 띄워서 가져오는 예제 (느림)""" | |
import csv | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
# Module-level Firefox WebDriver instance, created as soon as this module is
# executed. main() drives this browser and the __main__ guard releases it in
# its `finally` clause.
driver = webdriver.Firefox()
def main():
    """Run the interactive Google search demo in the shared browser.

    Opens Google, types the search keyword and submits it, then pauses on
    ``input()`` twice so the presenter controls the pace: once before the
    "News" link is clicked and once before the result anchors are collected.
    Every collected title is printed, and all results are written to
    ``results.csv`` via ``save_as_file``.

    Uses the module-level ``driver``; it does not create or release the
    browser itself.
    """
    # Imported locally so the block is self-contained. The generic
    # find_element(By.XPATH, ...) form replaces the find_element_by_xpath
    # helpers, which were removed in Selenium 4.3, and also works on older
    # Selenium releases.
    from selenium.webdriver.common.by import By

    driver.get('https://www.google.com?hl=en')

    # NOTE(review): the requests-based demo searches for
    # '한국여성과학기술인지원센터'; the keyword below lacks '기술' — confirm
    # which spelling is intended.
    search = driver.find_element(By.XPATH, "//input[@title='Search']")
    search.send_keys('한국여성과학인지원센터')
    search.send_keys(Keys.RETURN)

    # Block until the presenter presses Enter.
    input('Move to News section ')
    news = driver.find_element(By.XPATH, "//a[text()='News']")
    news.click()

    input('Collect titles ')
    titles = driver.find_elements(By.XPATH, "//div[@class='g']/div/div/h3/a")
    for title in titles:
        print(title.get_attribute('text'))

    save_as_file(titles, 'results.csv')
def save_as_file(results, filename):
    """Write the collected search results to *filename* as CSV.

    The file starts with a ``Title,Link`` header row, followed by one row per
    result.

    Args:
        results: Iterable of objects exposing ``get_attribute(name)``; the
            ``'text'`` and ``'href'`` attributes are read from each one (in
            this script, the anchor elements collected by ``main``).
        filename: Path of the CSV file to create or overwrite.
    """
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate '\n' on write produce '\r\r\n' row endings.
    with open(filename, 'w', newline='') as fout:
        writer = csv.writer(
            fout, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Title', 'Link'])
        writer.writerows(
            [result.get_attribute('text'), result.get_attribute('href')]
            for result in results
        )
if __name__ == '__main__':
    try:
        main()
    finally:
        # Always release the browser, even if main() raised. quit() ends the
        # whole WebDriver session (every window plus the driver process),
        # whereas close() only closes the current window.
        driver.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""아무런 창도 띄우지 않고 바로 가져오는 예제""" | |
import csv | |
from urllib.parse import quote | |
from bs4 import BeautifulSoup | |
import requests | |
def process(keyword, filename, timeout=10):
    """Fetch Google search results for *keyword* and save them as CSV.

    Sends one GET request to ``https://www.google.co.kr/search``, selects
    every ``h3.r a`` anchor from the returned HTML, and hands the anchors to
    ``save_as_file``.

    Args:
        keyword: Search phrase, sent as the ``q`` query parameter.
        filename: Path of the CSV file to write.
        timeout: Value passed to ``requests.get`` as its ``timeout`` (in
            seconds), so a stalled connection cannot hang the script.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://www.google.co.kr/search'
    params = {
        # NOTE(review): 'tbm': 'nws' presumably selects the News results —
        # confirm against Google's current query parameters.
        'tbm': 'nws',
        'source': 'lnms',
        'prmd': 'inmv',
        'sa': 'X',
        'q': keyword,
    }
    # Browser-like request headers.
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    resp = requests.get(
        url, params=params, headers=request_headers, timeout=timeout)
    # Fail loudly on an error response instead of silently writing a CSV
    # that contains only the header row.
    resp.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(resp.content, 'html.parser')
    anchors = soup.select('h3.r a')
    save_as_file(anchors, filename)
def save_as_file(headers, filename):
    """Write the scraped result anchors to *filename* as CSV.

    The file starts with a ``Title,Link`` header row, followed by one row per
    anchor.

    Args:
        headers: Iterable of objects exposing a ``text`` attribute and a
            ``get(name)`` method; ``text`` and ``get('href')`` are read from
            each one (in this script, the anchors selected by ``process``).
        filename: Path of the CSV file to create or overwrite.
    """
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate '\n' on write produce '\r\r\n' row endings.
    with open(filename, 'w', newline='') as fout:
        writer = csv.writer(
            fout, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Title', 'Link'])
        writer.writerows(
            [header.text, header.get('href')] for header in headers
        )
if __name__ == '__main__':
    # Script entry point: run the search and write the results to
    # results.csv.
    process(keyword='한국여성과학기술인지원센터', filename='results.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment