Last active
February 19, 2022 17:51
-
-
Save aphi/b68b1fd585db931e455e7285c7702166 to your computer and use it in GitHub Desktop.
Extract data from Investing.com for use within investpy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Extract data from Investing.com for use within investpy | |
Output is comma-separated and can be copied directly into stocks.csv to add missing stocks | |
$ python search_investing_urls.py | |
country,name,full_name,tag,isin,id,currency,symbol | |
united states,Airbnb,Airbnb,airbnb-inc,US0090661010,1167744,USD,ABNB | |
united kingdom,Deliveroo Holdings,Deliveroo Holdings PLC,deliveroo-holdings,GB00BNC5T391,1172028,GBP,ROO | |
united states,Nio A ADR,Nio Inc Class A ADR,nio-inc,US62914V1061,1096032,USD,NIO | |
""" | |
import re | |
from collections import Counter, OrderedDict | |
from urllib.request import Request, urlopen | |
# Investing.com equity pages to scrape — add tags here for missing stocks.
_EQUITY_BASE = 'https://www.investing.com/equities/'
URLS = [
    _EQUITY_BASE + tag
    for tag in ('airbnb-inc', 'deliveroo-holdings', 'nio-inc')
]
def select_match(matches):
    """Return the most frequent element of *matches*.

    Ties go to the element first encountered; an empty input yields ''.
    """
    if matches:
        # Counter.most_common orders equal counts by first occurrence,
        # matching the "first in case of tie" contract.
        return Counter(matches).most_common(1)[0][0]
    return ''
# One regex per output column, in stocks.csv column order. The page embeds
# JSON inside JSON strings, hence the multiple levels of backslash escaping.
regexs = OrderedDict([
    ('country', r'market\\"\:\{\\"name\\"\:\\"([a-zA-Z0-9- ]*)'),
    ('name', r'name\\\\\\"\:\{\\\\\\"shortName\\\\\\":\\\\\\"([a-zA-Z0-9- ]*)'),
    ('full_name', r'underlyingName\\\\\\"\:\\\\\\"([a-zA-Z0-9- ]*)'),
    ('tag', r'query"\:\{"equity"\:\["([a-zA-Z0-9- ]*)'),
    ('isin', r'isin\\\\\\":\\\\\\"([a-zA-Z0-9- ]*)'),
    ('id', r'instrument_id\\"\:\\"([a-zA-Z0-9- ]*)'),
    ('currency', r'currency\\\\\\":\\\\\\"([a-zA-Z0-9- ]*)'),
    ('symbol', r'SectionInstrument_Ticker\\"\:\\"([a-zA-Z0-9- ]*)'),
])
# CSV header matching investpy's stocks.csv column layout.
print('country,name,full_name,tag,isin,id,currency,symbol')

# Compile each pattern once, instead of once per field per URL.
compiled = OrderedDict(
    (field, re.compile(regex)) for field, regex in regexs.items()
)

for url in URLS:
    # Browser-like User-Agent: Investing.com rejects the default urllib agent.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Context manager closes the HTTP response even if read/decode fails
    # (the original leaked the connection).
    with urlopen(req) as response:
        page_text = response.read().decode('utf-8')
    row = []
    for search_field, pattern in compiled.items():
        match = select_match(pattern.findall(page_text))
        if search_field == 'country':  # stocks.csv stores countries lowercased
            match = match.lower()
        row.append(match)
    print(','.join(row))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage: