Skip to content

Instantly share code, notes, and snippets.

@KnightChaser
Last active November 19, 2022 04:38
Show Gist options
  • Save KnightChaser/f0a15345d31cb1b11049d32bcedc4c32 to your computer and use it in GitHub Desktop.
구글 검색 결과 개수, 시간, 제목과 그 링크, 처리하는데 걸린 시간을 담아서 반환하는 코드. 구글 인덱스 파싱하는 코드가 안 보여서 직접 만들어 봄.
import requests
from bs4 import BeautifulSoup
import time
import re
import random
class GoogleParsingConst:
    """Constants used when scraping a Google search results page.

    NOTE(review): the CSS class names below are Google's obfuscated,
    generated names — they change frequently, so expect to update them
    when parsing silently stops matching.
    """

    # CSS class of the <h3> element holding each search result's title.
    index_name_class_name = "LC20lb MBeuO DKV0Md"
    # CSS class of the <cite> element holding each result's displayed URL.
    index_url_class_name = "iUh30 qLRx3b tjvcx"
    # Strips HTML tags and ";"-terminated character entities (e.g. "&amp;").
    # Raw strings are used so "\d" is not an invalid string escape.
    html_tag_del_regex = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    # Strips leftover entities that lack the trailing ";".
    html_entity_del_regex = re.compile(r'&(?:[a-z\d]+|#\d+|#x[a-f\d]+)')
    # Pool of realistic browser user-agent strings; one is picked at random
    # per request to look less like an automated scraper.
    user_agent_string_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Mozilla/5.0 (Windows NT 10.0; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.35'
    ]
def get_google_search_index(keyword, max_results=5):
    """Scrape the first page of Google results for *keyword*.

    Parameters
    ----------
    keyword : str or None
        The search query. ``None`` short-circuits to ``False``
        (preserves the original API contract).
    max_results : int, optional
        Maximum number of (title, url) entries to return. Defaults to 5,
        matching the original hard-coded limit.

    Returns
    -------
    dict
        ``{'search_keyword', 'search_result' (result count),
        'search_time' (Google-reported seconds), 'processing_time'
        (local wall-clock seconds), 'index' ({seq: {'index_name',
        'index_url'}})}``.
    str
        An ``"exception occurred : ..."`` message on failure.
    bool
        ``False`` when *keyword* is ``None``.
    """
    if keyword is None:
        return False
    start_time = time.time()
    try:
        headers = {"user-agent": random.choice(GoogleParsingConst.user_agent_string_list)}
        # Pass the query via params= so requests URL-encodes it; the old
        # f-string interpolation broke on spaces and special characters.
        res = requests.get(
            "https://www.google.com/search",
            params={"q": keyword},
            headers=headers,
            timeout=10,  # don't hang forever on a stalled connection
        )
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
    except Exception as e:
        return f"exception occurred : {e}"

    # Google omits #result-stats when it serves a CAPTCHA / block page.
    stats_node = soup.select_one("#result-stats")
    if stats_node is None:
        return "exception occurred : result stats not found (request may be blocked)"

    # Korean-locale stats line, e.g. "검색결과 약 1,234,567개 (0.45초)":
    # token [2] is the result count, token [3] the elapsed time.
    stats_parts = stats_node.text.split(' ')
    result_count = int(stats_parts[2].replace('개', '').replace(',', ''))
    result_time = float(stats_parts[3].replace('(', '').replace(')', '').replace('초', ''))

    result_index_name_list = soup.find_all("h3", {"class": GoogleParsingConst.index_name_class_name})
    result_index_url_list = soup.find_all("cite", {"class": GoogleParsingConst.index_url_class_name})
    processing_time = time.time() - start_time

    result = {
        'search_keyword': keyword,
        'search_result': result_count,
        'search_time': result_time,
        'processing_time': processing_time,
        'index': {},
    }
    for seq, (name_tag, url_tag) in enumerate(zip(result_index_name_list, result_index_url_list)):
        if seq >= max_results:
            break
        index_name = re.sub(GoogleParsingConst.html_tag_del_regex, '', str(name_tag))
        index_name = re.sub(GoogleParsingConst.html_entity_del_regex, '', index_name)
        index_url = re.sub(GoogleParsingConst.html_tag_del_regex, '', str(url_tag))
        index_url = re.sub(GoogleParsingConst.html_entity_del_regex, '', index_url)
        # Google renders URL path separators as "›" (U+203A, NOT ">")
        # with surrounding spaces; restore plain "/" form.
        index_url = index_url.replace("›", '/').replace(' ', '')
        result['index'][seq] = {
            'index_name': index_name,
            'index_url': index_url,
        }
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment