Skip to content

Instantly share code, notes, and snippets.

@svenhofstede-zz
Last active December 19, 2021 12:38
Show Gist options
  • Save svenhofstede-zz/fae68e46330f4215e79b9e960d60278c to your computer and use it in GitHub Desktop.
Save svenhofstede-zz/fae68e46330f4215e79b9e960d60278c to your computer and use it in GitHub Desktop.
# requirements: install python packages
# importing all required packages, pandas, numpy, matplotlib, seaborn, requests,...
import pandas as pd
import numpy as np
import requests
import seaborn as sb
import matplotlib.pyplot as plt
import os
import scrapy
from bs4 import BeautifulSoup
# Load the vacancies-per-sector data set from the CSV file that sits in the
# same directory as this script (resolved through symlinks via realpath).
current_dir_path = os.path.dirname(os.path.realpath(__file__))
csv_path = os.path.join(current_dir_path, 'JobVacanciesPerSector.csv')
csv = pd.read_csv(csv_path)
# importing data using web scraping
# Fetch the jobs.ie home page. The timeout keeps the script from hanging
# forever on an unresponsive server, and raise_for_status() surfaces HTTP
# errors instead of silently parsing an error page.
response = requests.get(url='https://www.jobs.ie', timeout=30)
response.raise_for_status()
home_page = response.text
soup = BeautifulSoup(home_page, 'html.parser')

# Locate the "jobs by category" section; every <a> inside it is one category.
category_sections = soup.find_all(
    "section", {"class": "jobs-by-category accordion"}
)
if not category_sections:
    # Same exception type the bare [0] lookup used to raise, but with a
    # message that says what is actually missing.
    raise IndexError(
        "no <section class='jobs-by-category accordion'> found on "
        "https://www.jobs.ie - the page layout may have changed"
    )
all_categories = category_sections[0].find_all("a")

category_names = []
category_counts = []
for link in all_categories:
    count_spans = link.find_all("span")
    if not count_spans:
        # A link without a <span> has no count to record; skip it rather
        # than crash with an IndexError.
        continue
    # next_element is the first node parsed after the tag's opening, i.e. the
    # leading text of the link / span. Converting to str detaches the value
    # from the parse tree, and strip() removes surrounding markup whitespace.
    category_text = str(link.next_element).strip()
    category_count = str(count_spans[0].next_element).strip()
    print(f"{category_text} {category_count}")
    category_names.append(category_text)
    category_counts.append(category_count)

# One row per category: its name and the vacancy count shown next to it
# (the count is kept as text, exactly as scraped).
output_dict = {
    "category_text": category_names,
    "category_count": category_counts,
}
scraped_df = pd.DataFrame.from_dict(output_dict)
# data discovery, column names? number of columns and rows, missing values, duplicate data, primary key?
# pd.set_option('display.max_rows', 50)
# data type
# print(type(csv))
# CSV headers
# print(csv.keys())
# shape of csv
# print(csv.shape)
#
# data cleaning
# print(csv.value_counts())
# print(csv.duplicated().value_counts())
# print('There are ' + str(len(csv)-len(csv.drop_duplicates())) + ' duplicate rows in the data set')
# print('There are ' + str(csv.isna().sum().sum()) + ' missing values in the data set')
#
# data analysis
# df = pd.DataFrame(csv, columns=['Sector', 'Vacancies'])
# print(df)
# data visualisation
# analysis conclusion
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment