Last active
December 17, 2015 14:28
-
-
Save petitviolet/5624321 to your computer and use it in GitHub Desktop.
search.pyで利用する、YahooApiから返って来たxmlから検索結果1件ごとのタイトルとurlとサマリーを抽出する。
タイトルとurlは改行と空白文字を許さず、サマリーはそのまま取得する。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding:utf-8 -*- | |
import re | |
# Patterns for the pieces of a search-result XML document.  Every capture is
# non-greedy so that a match stops at the first closing tag.
result_pattern = r'<Result>(.+?)</Result>'
link_pattern = r'<Url>(.*?)</Url>'
title_pattern = r'<Title>(.*?)</Title>'
summary_pattern = r'<Summary>(.*?)</Summary>'

# Characters removed from titles and URLs.  ``\s`` already covers newlines;
# the previous pattern r'\\n|\s' additionally matched a literal backslash
# followed by "n", which corrupted text such as "C:\new".
strip_pattern = r'\s'

# re.S lets "." match newlines, so an element's content may span several lines.
catch_result = re.compile(result_pattern, re.S)
catch_link = re.compile(link_pattern, re.S)
catch_title = re.compile(title_pattern, re.S)
catch_summary = re.compile(summary_pattern, re.S)
# No "." in strip_pattern, so re.S would have no effect here.
stripper = re.compile(strip_pattern)
def _extract_field(pattern, text, default, strip=False):
    '''Return the first capture group of ``pattern`` in ``text``.

    ``default`` is returned when ``pattern`` does not match.  When ``strip``
    is true, the captured text is passed through the module-level
    ``stripper`` regex before it is returned.
    '''
    match = pattern.search(text)
    if match is None:
        return default
    value = match.group(1)
    if strip:
        value = stripper.sub('', value)
    return value


def split_result(html):
    '''Extract [title, link, summary] from each <Result> element of ``html``.

    ``html`` is the XML text returned by the Yahoo API.  The title and the
    link are cleaned with ``stripper``; the summary is returned exactly as
    captured.  A field whose tag is missing from a result is replaced by a
    placeholder string.

    Returns a list with one ``[title, link, summary]`` list per result, in
    document order.  Empty or missing input yields an empty list.
    '''
    if not html:
        # Nothing to parse (e.g. an empty response body).
        return []

    splited_result = []
    # finditer only yields successful matches, so no per-match check is needed.
    for match in catch_result.finditer(html):
        body = match.group(1)
        title = _extract_field(catch_title, body, 'タイトルなかったよ', strip=True)
        link = _extract_field(catch_link, body, 'urlなかったよ', strip=True)
        summary = _extract_field(catch_summary, body, 'スニペットなかったよ')
        splited_result.append([title, link, summary])
    return splited_result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment