justagist/railpro_download.py

## railpro_download.py
"""
Download all listed companies from railpro.co.uk.

Saves to a single csv/xlsx file with columns
"Company Name", "Address", "Phone", "Website"
"""
import re
import urllib.request
import pandas as pd
import html

# Listing page URL template; %d is the 1-based page number.
_DIRECTORY_URL = 'https://www.railpro.co.uk/business-directory/page/%d'
_COLUMNS = ("Company Name", "Address", "Phone", "Website")

# The patterns run on decoded page text, so DOTALL is needed for "." to
# cross any line breaks inside an <article> element.
_ARTICLE_RE = re.compile(r'<article id=(.*?)</article>', re.DOTALL)
_TITLE_BLOCK_RE = re.compile(r'title="(.*?)</div> </', re.DOTALL)
_TITLE_ATTR_RE = re.compile(r'title="(.*?)"', re.DOTALL)
_ADDRESS_BLOCK_RE = re.compile(r'title="(.*?)">Post Code:', re.DOTALL)
_FIELD_RE = re.compile(
    r'<span class="w2dc-field-content"> (.*?) </span>', re.DOTALL)
_POST_CODE_RE = re.compile(
    r'Post Code:</span> </span> <span class="w2dc-field-content"> (.*?) </span></div>',
    re.DOTALL)
_PHONE_RE = re.compile(
    r'Phone:</span> </span> <span class="w2dc-field-content"> (.*?) </span></div>',
    re.DOTALL)
_WEBSITE_RE = re.compile(r'"url" href="(.*?)"', re.DOTALL)


def _fetch_page(url):
    """Download *url* and return the raw response body as bytes."""
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        return response.read()


def _as_text(page):
    """Return *page* as str, decoding bytes rather than taking their repr.

    ``str(some_bytes)`` yields the ``b'...'`` repr, in which non-ASCII
    characters and apostrophes show up as backslash escapes; decoding keeps
    the scraped values readable.
    """
    if isinstance(page, bytes):
        # NOTE(review): UTF-8 is assumed here; confirm against the site.
        return page.decode('utf-8', errors='replace')
    return page


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ""."""
    found = pattern.search(text)
    return found.group(1) if found else ""


def _company_name(article):
    """Extract the company name from one listing's HTML.

    Raises:
        ValueError: if the listing does not contain exactly one title match.
    """
    titles = _TITLE_BLOCK_RE.findall(article) or _TITLE_ATTR_RE.findall(article)
    if len(titles) != 1:
        # Explicit raise instead of assert, which is removed under "python -O".
        raise ValueError(
            "Expected exactly one company title per listing, found %d: %r"
            % (len(titles), article[:200]))
    # The capture may run past the attribute; keep the text before its
    # closing quote only.
    return html.unescape(titles[0].split('"')[0])


def _parse_listing(article):
    """Turn one listing's HTML into a row dict keyed by the output columns.

    Fields that are absent from the listing are returned as empty strings.
    """
    # Address lines precede the post code, which is appended as the last line.
    address_lines = _FIELD_RE.findall(_first_group(_ADDRESS_BLOCK_RE, article))
    address_lines.append(_first_group(_POST_CODE_RE, article))
    return {
        "Company Name": _company_name(article),
        "Address": "\n".join(address_lines),
        "Phone": _first_group(_PHONE_RE, article),
        "Website": _first_group(_WEBSITE_RE, article),
    }


def scrape_directory(fetch=_fetch_page, max_pages=499):
    """Collect one row per company from the paginated business directory.

    Args:
        fetch: callable taking a page URL and returning its body (bytes or
            str). Defaults to a real HTTP download.
        max_pages: highest page number that will be requested.

    Returns:
        A list of row dicts, in the order the listings were encountered.
        Scraping stops at the first page that contains no listings.
    """
    rows = []
    for page_no in range(1, max_pages + 1):
        articles = _ARTICLE_RE.findall(_as_text(fetch(_DIRECTORY_URL % page_no)))
        if not articles:
            print("Done. Total webpages crawled: %d" % (page_no - 1))
            break

        print(page_no)
        rows.extend(_parse_listing(article) for article in articles)
    return rows


if __name__ == "__main__":
    rows_list = scrape_directory()

    # Explicit columns keep the header row even when nothing was scraped.
    df = pd.DataFrame(rows_list, columns=list(_COLUMNS))
    df.index += 1
    # df.to_csv("kuthirappavan.csv")
    df.to_excel("cid_escape.xlsx")

## railpro_multi_category_download.py
"""
Download from all pages of railpro.co.uk in different categories
by going through a provided list of corresponding urls.

Saves to a single file with additional column "Category". Merges
entries into one if entity belongs to multiple categories.
"""
import re
import urllib.request
import pandas as pd
import html

# ----- name of output file (.xlsx or .csv, or None for not saving)
output_file = "installment_fellowship.xlsx"

# ----- example category list
category_list = {
    "category1": "https://www.railpro.co.uk/business-directory/business-category/infrastructure/welding-products-services/",
    "category2": "https://www.railpro.co.uk/business-directory/business-category/infrastructure/engineering-tools-equipment/"
}

_FIELDS = ("Company Name", "Address", "Phone", "Website", "Category")

# The patterns run on decoded page text, so DOTALL is needed for "." to
# cross any line breaks inside an <article> element.
_RE_ARTICLE = re.compile(r'<article id=(.*?)</article>', re.DOTALL)
_RE_TITLE_BLOCK = re.compile(r'title="(.*?)</div> </', re.DOTALL)
_RE_TITLE_ATTR = re.compile(r'title="(.*?)"', re.DOTALL)
_RE_ADDRESS_BLOCK = re.compile(r'title="(.*?)">Post Code:', re.DOTALL)
_RE_FIELD = re.compile(
    r'<span class="w2dc-field-content"> (.*?) </span>', re.DOTALL)
_RE_POST_CODE = re.compile(
    r'Post Code:</span> </span> <span class="w2dc-field-content"> (.*?) </span></div>',
    re.DOTALL)
_RE_PHONE = re.compile(
    r'Phone:</span> </span> <span class="w2dc-field-content"> (.*?) </span></div>',
    re.DOTALL)
_RE_WEBSITE = re.compile(r'"url" href="(.*?)"', re.DOTALL)


def _download(url):
    """Download *url* and return the raw response body as bytes."""
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        return response.read()


def _page_text(page):
    """Return *page* as str, decoding bytes rather than taking their repr.

    ``str(some_bytes)`` yields the ``b'...'`` repr, in which non-ASCII
    characters and apostrophes show up as backslash escapes; decoding keeps
    the scraped values readable.
    """
    if isinstance(page, bytes):
        # NOTE(review): UTF-8 is assumed here; confirm against the site.
        return page.decode('utf-8', errors='replace')
    return page


def _search(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ""."""
    found = pattern.search(text)
    return found.group(1) if found else ""


def _entry_name(article):
    """Extract the company name from one listing's HTML.

    Raises:
        ValueError: if the listing does not contain exactly one title match.
    """
    titles = _RE_TITLE_BLOCK.findall(article) or _RE_TITLE_ATTR.findall(article)
    if len(titles) != 1:
        # Explicit raise instead of assert, which is removed under "python -O".
        raise ValueError(
            "Expected exactly one company title per listing, found %d: %r"
            % (len(titles), article[:200]))
    # The capture may run past the attribute; keep the text before its
    # closing quote only.
    return html.unescape(titles[0].split('"')[0])


def _entry_details(article):
    """Return (address, phone, website) for a listing; missing fields are ""."""
    # Address lines precede the post code, which is appended as the last line.
    address_lines = _RE_FIELD.findall(_search(_RE_ADDRESS_BLOCK, article))
    address_lines.append(_search(_RE_POST_CODE, article))
    return ("\n".join(address_lines),
            _search(_RE_PHONE, article),
            _search(_RE_WEBSITE, article))


def _output_format(path):
    """Return "csv" or "xlsx" for *path*, or None when *path* is None.

    Raises:
        ValueError: if the file extension is neither csv nor xlsx.
    """
    if path is None:
        return None
    extension = path.split(".")[-1]
    if extension not in ("csv", "xlsx"):
        raise ValueError(
            "Invalid file type for output file. Use csv or xlsx extension")
    return extension


def scrape_categories(categories, fetch=_download, max_pages=349):
    """Scrape every category and merge companies listed in several of them.

    Args:
        categories: mapping of category label to the category's base URL.
        fetch: callable taking a page URL and returning its body (bytes or
            str). Defaults to a real HTTP download.
        max_pages: highest page number requested for each category.

    Returns:
        A tuple ``(companies, pages_scraped, listings_seen)`` where
        *companies* maps company name to its row dict. A company found under
        several categories has them joined with ", " in its "Category" value.
    """
    companies = {}
    pages_scraped = 0
    listings_seen = 0
    for category, url in categories.items():
        if not url.endswith('/'):
            url += "/"

        print("\nScraping railpro pages under category: '%s'" % category)
        for page_no in range(1, max_pages + 1):
            page = _page_text(fetch('%spage/%d' % (url, page_no)))
            articles = _RE_ARTICLE.findall(page)
            if not articles:
                print("\t'%s' Done. Pages scraped: %d" % (category, page_no - 1))
                break

            print("\tpg: ", page_no)
            pages_scraped += 1

            for article in articles:
                listings_seen += 1
                name = _entry_name(article)
                known = companies.get(name)
                if known is None:
                    address, phone, website = _entry_details(article)
                    companies[name] = {
                        "Company Name": name,
                        "Address": address,
                        "Phone": phone,
                        "Website": website,
                        "Category": category,
                    }
                elif category not in known["Category"].split(", "):
                    # A listing repeated inside one category must not add
                    # that category a second time.
                    print("\t\t[INFO]: Modifying category name for %s" % name)
                    known["Category"] += ", %s" % category
        else:
            # Reached only when the page loop ran out without hitting an
            # empty page.
            print("\n[Warning]: Stopped scraping after reaching the pre-set range limit of pages. The output list may not be exhaustive!\n")
    return companies, pages_scraped, listings_seen


if __name__ == "__main__":
    # Reject an unsupported extension before spending time on the scrape.
    output_format = _output_format(output_file)

    company_dict, page_count, num_comps = scrape_categories(category_list)

    rows_list = list(company_dict.values())

    # Explicit columns keep the header row even when nothing was scraped.
    df = pd.DataFrame(rows_list, columns=list(_FIELDS))
    df.index += 1

    print("\nCompleted!")
    # len(df) also works for an empty result, where df.index[-1] would raise.
    print("Total pages scraped: %d; Total companies found: %d; \nFinal number of companies after merge: %d" %
          (page_count, num_comps, len(df)))

    if output_format == "csv":
        df.to_csv(output_file)
    elif output_format == "xlsx":
        df.to_excel(output_file)
	"""
	Download all listed companies from railpro.co.uk.

	Saves to a single csv/xlsx file with columns
	"Company Name", "Address", "Phone", "Website"
	"""
	import re
	import urllib.request
	import pandas as pd
	import html

	# NOTE(review): the original lines here had lost their nested indentation
	# and did not parse; the structure has been rebuilt while keeping the
	# one-tab offset shared by the surrounding lines.

	# Listing page URL template; %d is the 1-based page number.
	_DIRECTORY_URL = 'https://www.railpro.co.uk/business-directory/page/%d'
	_COLUMNS = ("Company Name", "Address", "Phone", "Website")

	# The patterns run on decoded page text, so DOTALL is needed for "." to
	# cross any line breaks inside an <article> element.
	_ARTICLE_RE = re.compile(r'<article id=(.*?)</article>', re.DOTALL)
	_TITLE_BLOCK_RE = re.compile(r'title="(.*?)</div> </', re.DOTALL)
	_TITLE_ATTR_RE = re.compile(r'title="(.*?)"', re.DOTALL)
	_ADDRESS_BLOCK_RE = re.compile(r'title="(.*?)">Post Code:', re.DOTALL)
	_FIELD_RE = re.compile(
	    r'<span class="w2dc-field-content"> (.*?) </span>', re.DOTALL)
	_POST_CODE_RE = re.compile(
	    r'Post Code:</span> </span> <span class="w2dc-field-content"> (.*?) </span></div>',
	    re.DOTALL)
	_PHONE_RE = re.compile(
	    r'Phone:</span> </span> <span class="w2dc-field-content"> (.*?) </span></div>',
	    re.DOTALL)
	_WEBSITE_RE = re.compile(r'"url" href="(.*?)"', re.DOTALL)


	def _fetch_page(url):
	    """Download *url* and return the raw response body as bytes."""
	    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
	    # The context manager closes the connection even if read() fails.
	    with urllib.request.urlopen(req) as response:
	        return response.read()


	def _as_text(page):
	    """Return *page* as str, decoding bytes rather than taking their repr.

	    ``str(some_bytes)`` yields the ``b'...'`` repr, in which non-ASCII
	    characters and apostrophes show up as backslash escapes; decoding keeps
	    the scraped values readable.
	    """
	    if isinstance(page, bytes):
	        # NOTE(review): UTF-8 is assumed here; confirm against the site.
	        return page.decode('utf-8', errors='replace')
	    return page


	def _first_group(pattern, text):
	    """Return group 1 of the first match of *pattern* in *text*, or ""."""
	    found = pattern.search(text)
	    return found.group(1) if found else ""


	def _company_name(article):
	    """Extract the company name from one listing's HTML.

	    Raises:
	        ValueError: if the listing does not contain exactly one title match.
	    """
	    titles = _TITLE_BLOCK_RE.findall(article) or _TITLE_ATTR_RE.findall(article)
	    if len(titles) != 1:
	        # Explicit raise instead of assert, which is removed under "python -O".
	        raise ValueError(
	            "Expected exactly one company title per listing, found %d: %r"
	            % (len(titles), article[:200]))
	    # The capture may run past the attribute; keep the text before its
	    # closing quote only.
	    return html.unescape(titles[0].split('"')[0])


	def _parse_listing(article):
	    """Turn one listing's HTML into a row dict keyed by the output columns.

	    Fields that are absent from the listing are returned as empty strings.
	    """
	    # Address lines precede the post code, which is appended as the last line.
	    address_lines = _FIELD_RE.findall(_first_group(_ADDRESS_BLOCK_RE, article))
	    address_lines.append(_first_group(_POST_CODE_RE, article))
	    return {
	        "Company Name": _company_name(article),
	        "Address": "\n".join(address_lines),
	        "Phone": _first_group(_PHONE_RE, article),
	        "Website": _first_group(_WEBSITE_RE, article),
	    }


	def scrape_directory(fetch=_fetch_page, max_pages=499):
	    """Collect one row per company from the paginated business directory.

	    Args:
	        fetch: callable taking a page URL and returning its body (bytes or
	            str). Defaults to a real HTTP download.
	        max_pages: highest page number that will be requested.

	    Returns:
	        A list of row dicts, in the order the listings were encountered.
	        Scraping stops at the first page that contains no listings.
	    """
	    rows = []
	    for page_no in range(1, max_pages + 1):
	        articles = _ARTICLE_RE.findall(_as_text(fetch(_DIRECTORY_URL % page_no)))
	        if not articles:
	            print("Done. Total webpages crawled: %d" % (page_no - 1))
	            break

	        print(page_no)
	        rows.extend(_parse_listing(article) for article in articles)
	    return rows


	if __name__ == "__main__":
	    rows_list = scrape_directory()

	    # Explicit columns keep the header row even when nothing was scraped.
	    df = pd.DataFrame(rows_list, columns=list(_COLUMNS))
	    df.index += 1
	    # df.to_csv("kuthirappavan.csv")
	    df.to_excel("cid_escape.xlsx")
	"""
	Download from all pages of railpro.co.uk in different categories
	by going through a provided list of corresponding urls.

	Saves to a single file with additional column "Category". Merges
	entries into one if entity belongs to multiple categories.
	"""
	import re
	import urllib.request
	import pandas as pd
	import html

	# NOTE(review): the original lines here had lost their nested indentation
	# and did not parse; the structure has been rebuilt while keeping the
	# one-tab offset shared by the surrounding lines.

	# ----- name of output file (.xlsx or .csv, or None for not saving)
	output_file = "installment_fellowship.xlsx"

	# ----- example category list
	category_list = {
	    "category1": "https://www.railpro.co.uk/business-directory/business-category/infrastructure/welding-products-services/",
	    "category2": "https://www.railpro.co.uk/business-directory/business-category/infrastructure/engineering-tools-equipment/"
	}

	_FIELDS = ("Company Name", "Address", "Phone", "Website", "Category")

	# The patterns run on decoded page text, so DOTALL is needed for "." to
	# cross any line breaks inside an <article> element.
	_RE_ARTICLE = re.compile(r'<article id=(.*?)</article>', re.DOTALL)
	_RE_TITLE_BLOCK = re.compile(r'title="(.*?)</div> </', re.DOTALL)
	_RE_TITLE_ATTR = re.compile(r'title="(.*?)"', re.DOTALL)
	_RE_ADDRESS_BLOCK = re.compile(r'title="(.*?)">Post Code:', re.DOTALL)
	_RE_FIELD = re.compile(
	    r'<span class="w2dc-field-content"> (.*?) </span>', re.DOTALL)
	_RE_POST_CODE = re.compile(
	    r'Post Code:</span> </span> <span class="w2dc-field-content"> (.*?) </span></div>',
	    re.DOTALL)
	_RE_PHONE = re.compile(
	    r'Phone:</span> </span> <span class="w2dc-field-content"> (.*?) </span></div>',
	    re.DOTALL)
	_RE_WEBSITE = re.compile(r'"url" href="(.*?)"', re.DOTALL)


	def _download(url):
	    """Download *url* and return the raw response body as bytes."""
	    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
	    # The context manager closes the connection even if read() fails.
	    with urllib.request.urlopen(req) as response:
	        return response.read()


	def _page_text(page):
	    """Return *page* as str, decoding bytes rather than taking their repr.

	    ``str(some_bytes)`` yields the ``b'...'`` repr, in which non-ASCII
	    characters and apostrophes show up as backslash escapes; decoding keeps
	    the scraped values readable.
	    """
	    if isinstance(page, bytes):
	        # NOTE(review): UTF-8 is assumed here; confirm against the site.
	        return page.decode('utf-8', errors='replace')
	    return page


	def _search(pattern, text):
	    """Return group 1 of the first match of *pattern* in *text*, or ""."""
	    found = pattern.search(text)
	    return found.group(1) if found else ""


	def _entry_name(article):
	    """Extract the company name from one listing's HTML.

	    Raises:
	        ValueError: if the listing does not contain exactly one title match.
	    """
	    titles = _RE_TITLE_BLOCK.findall(article) or _RE_TITLE_ATTR.findall(article)
	    if len(titles) != 1:
	        # Explicit raise instead of assert, which is removed under "python -O".
	        raise ValueError(
	            "Expected exactly one company title per listing, found %d: %r"
	            % (len(titles), article[:200]))
	    # The capture may run past the attribute; keep the text before its
	    # closing quote only.
	    return html.unescape(titles[0].split('"')[0])


	def _entry_details(article):
	    """Return (address, phone, website) for a listing; missing fields are ""."""
	    # Address lines precede the post code, which is appended as the last line.
	    address_lines = _RE_FIELD.findall(_search(_RE_ADDRESS_BLOCK, article))
	    address_lines.append(_search(_RE_POST_CODE, article))
	    return ("\n".join(address_lines),
	            _search(_RE_PHONE, article),
	            _search(_RE_WEBSITE, article))


	def _output_format(path):
	    """Return "csv" or "xlsx" for *path*, or None when *path* is None.

	    Raises:
	        ValueError: if the file extension is neither csv nor xlsx.
	    """
	    if path is None:
	        return None
	    extension = path.split(".")[-1]
	    if extension not in ("csv", "xlsx"):
	        raise ValueError(
	            "Invalid file type for output file. Use csv or xlsx extension")
	    return extension


	def scrape_categories(categories, fetch=_download, max_pages=349):
	    """Scrape every category and merge companies listed in several of them.

	    Args:
	        categories: mapping of category label to the category's base URL.
	        fetch: callable taking a page URL and returning its body (bytes or
	            str). Defaults to a real HTTP download.
	        max_pages: highest page number requested for each category.

	    Returns:
	        A tuple ``(companies, pages_scraped, listings_seen)`` where
	        *companies* maps company name to its row dict. A company found under
	        several categories has them joined with ", " in its "Category" value.
	    """
	    companies = {}
	    pages_scraped = 0
	    listings_seen = 0
	    for category, url in categories.items():
	        if not url.endswith('/'):
	            url += "/"

	        print("\nScraping railpro pages under category: '%s'" % category)
	        for page_no in range(1, max_pages + 1):
	            page = _page_text(fetch('%spage/%d' % (url, page_no)))
	            articles = _RE_ARTICLE.findall(page)
	            if not articles:
	                print("\t'%s' Done. Pages scraped: %d" % (category, page_no - 1))
	                break

	            print("\tpg: ", page_no)
	            pages_scraped += 1

	            for article in articles:
	                listings_seen += 1
	                name = _entry_name(article)
	                known = companies.get(name)
	                if known is None:
	                    address, phone, website = _entry_details(article)
	                    companies[name] = {
	                        "Company Name": name,
	                        "Address": address,
	                        "Phone": phone,
	                        "Website": website,
	                        "Category": category,
	                    }
	                elif category not in known["Category"].split(", "):
	                    # A listing repeated inside one category must not add
	                    # that category a second time.
	                    print("\t\t[INFO]: Modifying category name for %s" % name)
	                    known["Category"] += ", %s" % category
	        else:
	            # Reached only when the page loop ran out without hitting an
	            # empty page.
	            print("\n[Warning]: Stopped scraping after reaching the pre-set range limit of pages. The output list may not be exhaustive!\n")
	    return companies, pages_scraped, listings_seen


	if __name__ == "__main__":
	    # Reject an unsupported extension before spending time on the scrape.
	    output_format = _output_format(output_file)

	    company_dict, page_count, num_comps = scrape_categories(category_list)

	    rows_list = list(company_dict.values())

	    # Explicit columns keep the header row even when nothing was scraped.
	    df = pd.DataFrame(rows_list, columns=list(_FIELDS))
	    df.index += 1

	    print("\nCompleted!")
	    # len(df) also works for an empty result, where df.index[-1] would raise.
	    print("Total pages scraped: %d; Total companies found: %d; \nFinal number of companies after merge: %d" %
	          (page_count, num_comps, len(df)))

	    if output_format == "csv":
	        df.to_csv(output_file)
	    elif output_format == "xlsx":
	        df.to_excel(output_file)