Vesihiisi/update-sources.bash

## update-sources.bash
#!/bin/bash
#
# Little reference to API
# --------------------------------
# Use http://api.arbetsformedlingen.se/platsannons/soklista/yrkesomraden
# to fetch a list of all available profession areas.
# The one that best fits our needs is area 3, "data/IT"
#
# http://api.arbetsformedlingen.se/platsannons/soklista/yrkesgrupper?yrkesomradeid=3
# This will fetch number of available jobs for each profession group within this
# profession area
#
# http://api.arbetsformedlingen.se/platsannons/soklista/yrken?yrkesgruppid=3515
# This will fetch number of available jobs for each profession within this
# profession group.


# To keep the number of API calls low, start by fetching the list of all ads
# within the relevant profession area in a single request.
# The row limit is 10 000, and since the number of results doesn't seem to be
# larger than 3000, that's not a problem.
#

echo "Will now fetch list of all relevant ads. This may take some time."

# --fail makes curl exit non-zero on an HTTP error response instead of handing
# the error body to jq. jq '.' pretty-prints the JSON before it is saved.
curl \
    --silent \
    --fail \
    --header "Accept: application/json" \
    --header "Accept-Language: sv" \
    --header "From: alkw15@student.bth.se" \
    "http://api.arbetsformedlingen.se/platsannons/matchning?yrkesomradeid=3&antalrader=10000" \
    | jq '.' > "ads_list.json"
# Copy PIPESTATUS right away: the next command overwrites it.
fetch_status=("${PIPESTATUS[@]}")
if [ "${fetch_status[0]}" -ne 0 ] || [ "${fetch_status[1]}" -ne 0 ]; then
    echo "Failed to fetch the list of ads (curl: ${fetch_status[0]}, jq: ${fetch_status[1]})." >&2
    exit 1
fi

echo "List of all relevant ads fetched."
printf '%s' "Number of ads: "
jq '.matchningslista.antal_platsannonser' ads_list.json
echo "Will now download all these ads. This may take some time."

# Working list of ids; -r makes jq print the values without quotation marks.
# This list is used to fetch all the actual ads.
#
if ! jq -r '.matchningslista.matchningdata[].annonsid' ads_list.json > "all_ids.txt"; then
    echo "Could not extract the ad ids from ads_list.json." >&2
    exit 1
fi

# Download every ad listed in all_ids.txt into ads/<id>.json.
#
# The output directory is created if needed, so the redirections below always
# have somewhere to write.
mkdir -p ads

if [ ! -s "all_ids.txt" ]; then
    # Without a list of ids there is nothing to download; wiping ads/ here
    # would only destroy the previous download without replacing it.
    echo "all_ids.txt is missing or empty; keeping existing ads, nothing to download." >&2
else
    # Remove all existing ads.
    # -f keeps rm quiet when ads/ is empty and the glob matches nothing.
    rm -rf -- ads/*

    # read -r keeps backslashes literal; the list is redirected into the loop
    # instead of being piped from cat.
    while read -r ad_id
    do
        # Skip blank lines so no request is made for an empty id.
        [ -n "$ad_id" ] || continue

        # For whatever reason, some requests take a lot of time...
        # As of now, we watch the output manually and restart the script
        # if it seems to take too much time.
        # Setting --max-time doesn't make sense since we would then miss some files.
        # Remove --silent to have something to look at while querying.
        nameoffile="ads/$ad_id.json"
        echo "Downloading and saving ad $ad_id as file $nameoffile"
        curl \
            --header "Accept: application/json" \
            --header "Accept-Language: sv" \
            --header "From: alkw15@student.bth.se" \
            --silent \
            "http://api.arbetsformedlingen.se/platsannons/$ad_id" \
            | jq '.' > "$nameoffile"
    done < "all_ids.txt"
fi
	#!/bin/bash
	#
	# Little reference to API
	# --------------------------------
	# Use http://api.arbetsformedlingen.se/platsannons/soklista/yrkesomraden
	# to fetch a list of all available profession areas.
	# The one that best fits our needs is area 3, "data/IT"
	#
	# http://api.arbetsformedlingen.se/platsannons/soklista/yrkesgrupper?yrkesomradeid=3
	# This will fetch number of available jobs for each profession group within this
	# profession area
	#
	# http://api.arbetsformedlingen.se/platsannons/soklista/yrken?yrkesgruppid=3515
	# This will fetch number of available jobs for each profession within this
	# profession group.


	# To keep the number of API calls low, start by fetching the list of all ads
	# within the relevant profession area in a single request.
	# The row limit is 10 000, and since the number of results doesn't seem to be
	# larger than 3000, that's not a problem.
	#

	echo "Will now fetch list of all relevant ads. This may take some time."

	# The pipe below must be a real, unescaped pipe: an escaped one hands "|",
	# "jq" and "." to curl as extra arguments instead of filtering the response.
	# --fail makes curl exit non-zero on an HTTP error response instead of
	# handing the error body to jq. jq '.' pretty-prints the JSON.
	curl \
		--silent \
		--fail \
		--header "Accept: application/json" \
		--header "Accept-Language: sv" \
		--header "From: alkw15@student.bth.se" \
		"http://api.arbetsformedlingen.se/platsannons/matchning?yrkesomradeid=3&antalrader=10000" \
		| jq '.' > "ads_list.json"
	# Copy PIPESTATUS right away: the next command overwrites it.
	fetch_status=("${PIPESTATUS[@]}")
	if [ "${fetch_status[0]}" -ne 0 ] || [ "${fetch_status[1]}" -ne 0 ]; then
		echo "Failed to fetch the list of ads (curl: ${fetch_status[0]}, jq: ${fetch_status[1]})." >&2
		exit 1
	fi

	echo "List of all relevant ads fetched."
	printf '%s' "Number of ads: "
	jq '.matchningslista.antal_platsannonser' ads_list.json
	echo "Will now download all these ads. This may take some time."

	# Working list of ids; -r makes jq print the values without quotation marks.
	# This list is used to fetch all the actual ads.
	#
	if ! jq -r '.matchningslista.matchningdata[].annonsid' ads_list.json > "all_ids.txt"; then
		echo "Could not extract the ad ids from ads_list.json." >&2
		exit 1
	fi

	# Download every ad listed in all_ids.txt into ads/<id>.json.
	#
	# The output directory is created if needed, so the redirections below
	# always have somewhere to write.
	mkdir -p ads

	if [ ! -s "all_ids.txt" ]; then
		# Without a list of ids there is nothing to download; wiping ads/ here
		# would only destroy the previous download without replacing it.
		echo "all_ids.txt is missing or empty; keeping existing ads, nothing to download." >&2
	else
		# Remove all existing ads.
		# -f keeps rm quiet when ads/ is empty and the glob matches nothing.
		rm -rf -- ads/*

		# The list is redirected into the loop (no escaped pipe, which made the
		# loop a syntax error); read -r keeps backslashes literal.
		while read -r ad_id
		do
			# Skip blank lines so no request is made for an empty id.
			[ -n "$ad_id" ] || continue

			# For whatever reason, some requests take a lot of time...
			# As of now, we watch the output manually and restart the script
			# if it seems to take too much time.
			# Setting --max-time doesn't make sense since we would then miss some files.
			# Remove --silent to have something to look at while querying.
			nameoffile="ads/$ad_id.json"
			echo "Downloading and saving ad $ad_id as file $nameoffile"
			curl \
				--header "Accept: application/json" \
				--header "Accept-Language: sv" \
				--header "From: alkw15@student.bth.se" \
				--silent \
				"http://api.arbetsformedlingen.se/platsannons/$ad_id" \
				| jq '.' > "$nameoffile"
		done < "all_ids.txt"
	fi