Skip to content

Instantly share code, notes, and snippets.

@sentientmachine
Forked from lobstrio/coinmarketcap_extract.py
Last active December 28, 2021 20:44
Show Gist options
  • Save sentientmachine/820e0c47b07fe90580e4c758d9c5df97 to your computer and use it in GitHub Desktop.
Extract all cryptocurrencies data from coinmarketcap.com using Python 3 and Request
#!/usr/bin/python3
# coding: utf-8
import csv
import datetime
import os

import requests
from bs4 import BeautifulSoup
from scrapy import Selector
#pip3 install --user bs4
#pip3 install --user scrapy
def extract(url):
    """Scrape a coinmarketcap "all cryptocurrencies" listing page and dump it to CSV.

    Args:
        url (str): URL of the coinmarketcap page to scrape.

    Returns:
        None on success or on a 404; [] when all 10 retries fail.

    NOTE(review): screen-scraping is against coinmarketcap.com's terms of
    service and the HTML is re-obfuscated every few months; a sustainable
    approach is an official API (e.g. coinbase's developer API) instead of
    requests/scrapy/BeautifulSoup.
    """
    print("Export all cryptodata from coinmarketcap.com")
    session = requests.session()
    start = datetime.datetime.now()
    # Retry up to 10 times if the site is temporarily inaccessible.
    # (Original bug: the else-branch returned [] on the FIRST failure, so
    # the loop never actually retried. Now we keep looping and only return
    # [] after all attempts are exhausted.)
    for retry in range(10):
        response = session.get(url=url)
        print("response is: ")
        print(response.headers)
        print("-- STATUS CODE --")
        print(response.status_code)
        print("now do parsing")
        if response.status_code == 200:
            print("response code is good")
            # Ensure the output directory exists before opening the file.
            os.makedirs("/tmp/coinmarketcap", exist_ok=True)
            out_path = "/tmp/coinmarketcap/cryptocurrencies_{}.csv".format(str(datetime.date.today()))
            with open(out_path, "w") as f:
                fieldnames = ['Nom', 'Symbole', 'Cap. marche', 'Prix', 'Offre en circulation', 'Volume (24h)', '% 1h', '% 24h', '7 j']
                writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
                writer.writeheader()
                soup = BeautifulSoup(response.text, features='html.parser')
                sel = Selector(text=soup.prettify())
                # One <tr class="cmc-table-row"> per listed currency.
                cryptos = sel.xpath("//tr[contains(@class, 'cmc-table-row')]").extract()
                print("cryptos list: '" + str(cryptos) + "'")
                for crypto in cryptos:
                    row_soup = BeautifulSoup(crypto, features='html.parser')
                    row_sel = Selector(text=row_soup.prettify())
                    print("found a crypto: " + str(crypto))
                    nom = row_sel.xpath("//a[contains(@class, 'cmc-table__column-name--name')]/text()").extract_first()
                    symbole = row_sel.xpath("//td[contains(@class, 'col-symbol')]/text()").extract_first()
                    cap_marche = row_sel.xpath("//td[contains(@class, 'market-cap')]/text()").extract_first()
                    prix = row_sel.xpath("//a[@class='price']/@data-usd").extract_first()
                    offre_circulation = row_sel.xpath("//a[@class='volume']/@data-usd").extract_first()
                    volume = row_sel.xpath("//td[contains(@class, 'circulating-supply')]/@data-sort").extract_first()
                    percent_1h = row_sel.xpath("//td[@data-timespan='1h']/@data-sort").extract_first()
                    percent_24h = row_sel.xpath("//td[@data-timespan='24h']/@data-sort").extract_first()
                    percent_7j = row_sel.xpath("//td[@data-timespan='7d']/@data-sort").extract_first()
                    values = [nom, symbole, cap_marche, prix, offre_circulation, volume, percent_1h, percent_24h, percent_7j]
                    # Original bug: missing (None) values were dropped, which
                    # shifted every later value under the wrong CSV header.
                    # Keep one slot per field, emitting '' for missing cells.
                    clean_values = [(v.strip().replace('\n', '') if v else '') for v in values]
                    writer.writerow(dict(zip(fieldnames, clean_values)))
            # Report how long the scrape took.
            end = datetime.datetime.now()
            print('\n')
            print('-- TIME ELAPSED --')
            print(str(end - start))
            break
        elif response.status_code == 404:
            print("Page indisponible")
            break
        else:
            print("Can't load page.")
    else:
        # All retries failed without a usable response.
        return []
def main():
    """Entry point: scrape the coinmarketcap "all cryptocurrencies" page."""
    target = "https://coinmarketcap.com/all/views/all/"
    print("url: '" + str(target) + "'")
    extract(target)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment