Arf9999/Iron_March_Forum_chloropleth_map.R

## Iron_March_Forum_chloropleth_map.R


##--------------------------------------------------------------------------------------------------##
##Mapping neo-nazi forum post volume to world map using the Iron March Forum leaked SQL database    ##
##Leaked database is available here: https://archive.org/details/iron_march_201911                  ##
## All below packages are required. All are available on CRAN.                                      ##
##--------------------------------------------------------------------------------------------------##


##packages--------##
library(tidyverse)

library(rgeolocate) ## free ip2location binary datafile is required: You can get it here https://lite.ip2location.com/database/ip-country (registration is required)
## Once downloaded, decompress and save to R working directory, (or edit the code to point to it)
library (choroplethr)

library (chloroplethrMaps)
##-----------------##

data(country.map, package = "choroplethrMaps") # The country map data

## The Iron March forum leak is in a SQLite format. SQLite doesn't manage dates very clearly. This is a quick function to change sql date to POSIXct.
sql_date <- function(date_field) {
  as.POSIXct(as.numeric(date_field), origin = "1970-01-01")
}

#Get the core members from archive.org csv
members <-
  read_csv("https://archive.org/download/iron_march_201911/csv/core_members.csv")

members <- members %>%
  mutate(no = row_number())##create index for joins
##-- Get IP geolocation of members --##
geolocation <-
  rgeolocate::ip2location(members$ip_address, file = "IP2LOCATION-LITE-DB1.BIN", c("country_code", "country_name")) %>%
  mutate(no = row_number())##create index for joins
##------------------------##

members <-
  left_join(members, geolocation, by = "no") ##Join to geolocation to members df
members <-
  members[, c(1:6, 96, 97, 7:75)] #re-order dataframe columns for easier reading


##Get forum messages from archive.org csv

messages <-
  read_csv("https://archive.org/download/iron_march_201911/csv/core_message_posts.csv") %>%
  mutate(join_no = row_number()) %>%
  mutate(member_id = msg_author_id) ##create index for joins to members df

message_geolocation <-
  rgeolocate::ip2location(o_members$ip_address,
                          file = "IP2LOCATION-LITE-DB1.BIN",
                          c("country_code", "country_name")) %>%
  mutate(join_no = row_number()) %>% ##create index for joins
  rename(msg_country_code = country_code, msg_country_name = country_name) ##adjust column names for join


messages <-
  left_join(messages, message_geolocation, by = "join_no") %>% ## join geolocation to messages
  left_join(members[, c(1:8)], by = "member_id")## join file for member details


#convert sql dates - this isn't really required for the mapping - but makes reading the data using View() more pleasant
messages$joined <- sql_date(messages$joined)
messages$msg_date <- sql_date(messages$msg_date)

## cleanup missing data -
messages$msg_country_code <-
  coalesce(messages$msg_country_code, messages$country_code) %>% ## replace message failed IP location with member IP location
  coalesce("US") ## where there is no IP location, convert to US (NB: If the no of NAs is very large, may be better to simply filter those rows out )

messages$country_code <-
  coalesce(messages$country_code, "US") ## ## where there is no IP location, convert to US (NB: If the no of NAs is very large, may be better to simply filter those rows out )


message_mapping <- messages %>%
  mutate(iso_a2 = country_code) %>%  ##iso_a2 is 2 letter iso country name
  mutate(iso_a2 = recode(
    iso_a2,
    ##{choroplethr} data for country.map seems to be missing info for any island nations.
    "SG" = "MY",
    ##Ugly hack to avoid failed mapping by allocating to closest country.
    "PR" = "US",
    "MT" = "IT",
    "PF" = "NZ"
  )) %>%
  group_by(iso_a2) %>%
  mutate(value = n()) %>% #Count no of posts per region
  ungroup()


message_mapping <-
  message_mapping[!duplicated(message_mapping["iso_a2"]), ] #remove duplicates - make df smaller.


message_mapping <-
  left_join(message_mapping, country.map[, c("iso_a2", "region")], by = "iso_a2")# join to country map to get choropleth regions
message_mapping <-
  message_mapping[!duplicated(message_mapping["iso_a2"]), ]#remove duplicate regions - choroplethr::country.map() requires unique entries for regions variable


##Create a world map of the count of forum posts
country_choropleth(message_mapping,
                   title = "IP Geolocation of messages on Iron March forums 2011 - 2017",
                   legend  = "No. of forum posts")

##-------------------------------------------------------------------------------------------------------------------------------------#
##rinse and repeat the above, but rather than allocating messages to msg IP, allocating to member IP (i.e. the IP recorded when joining)
message_mapping2 <- messages %>%
  mutate(iso_a2 = country_code) %>%
  mutate(iso_a2 = recode(
    iso_a2,
    "SG" = "MY",
    "PR" = "US",
    "MT" = "IT",
    "PF" = "NZ"
  )) %>%
  group_by(iso_a2) %>%
  mutate(value = n()) %>%
  ungroup()

message_mapping2 <-
  message_mapping2[!duplicated(message_mapping2["iso_a2"]), ]

message_mapping2 <-
  left_join(message_mapping2, country.map[, c("iso_a2", "region")], by = "iso_a2")
message_mapping2 <-
  message_mapping2[!duplicated(message_mapping2["iso_a2"]), ]

country_choropleth(message_mapping2,
                   title = "IP Geolocation forum users and volume of messages on Iron March forums 2011 - 2017",
                   legend  = "No. of forum posts")
##---------------------------------------------------------------------------------------------------------------------------------------#


	##--------------------------------------------------------------------------------------------------##
	##Mapping neo-nazi forum post volume to world map using the Iron March Forum leaked SQL database ##
	##Leaked database is available here: https://archive.org/details/iron_march_201911 ##
	## All below packages are required. All are available on CRAN. ##
	##--------------------------------------------------------------------------------------------------##


	##packages--------##
	library(tidyverse)

	library(rgeolocate) ## free ip2location binary datafile is required: You can get it here https://lite.ip2location.com/database/ip-country (registration is required)
	## Once downloaded, decompress and save to R working directory, (or edit the code to point to it)
	library (choroplethr)

	library (chloroplethrMaps)
	##-----------------##

	data(country.map, package = "choroplethrMaps") # The country map data

	## The Iron March forum leak is in a SQLite format. SQLite doesn't manage dates very clearly. This is a quick function to change sql date to POSIXct.
	sql_date <- function(date_field) {
	as.POSIXct(as.numeric(date_field), origin = "1970-01-01")
	}

	#Get the core members from archive.org csv
	members <-
	read_csv("https://archive.org/download/iron_march_201911/csv/core_members.csv")

	members <- members %>%
	mutate(no = row_number())##create index for joins
	##-- Get IP geolocation of members --##
	geolocation <-
	rgeolocate::ip2location(members$ip_address, file = "IP2LOCATION-LITE-DB1.BIN", c("country_code", "country_name")) %>%
	mutate(no = row_number())##create index for joins
	##------------------------##

	members <-
	left_join(members, geolocation, by = "no") ##Join to geolocation to members df
	members <-
	members[, c(1:6, 96, 97, 7:75)] #re-order dataframe columns for easier reading


	##Get forum messages from archive.org csv

	messages <-
	read_csv("https://archive.org/download/iron_march_201911/csv/core_message_posts.csv") %>%
	mutate(join_no = row_number()) %>%
	mutate(member_id = msg_author_id) ##create index for joins to members df

	message_geolocation <-
	rgeolocate::ip2location(o_members$ip_address,
	file = "IP2LOCATION-LITE-DB1.BIN",
	c("country_code", "country_name")) %>%
	mutate(join_no = row_number()) %>% ##create index for joins
	rename(msg_country_code = country_code, msg_country_name = country_name) ##adjust column names for join


	messages <-
	left_join(messages, message_geolocation, by = "join_no") %>% ## join geolocation to messages
	left_join(members[, c(1:8)], by = "member_id")## join file for member details


	#convert sql dates - this isn't really required for the mapping - but makes reading the data using View() more pleasant
	messages$joined <- sql_date(messages$joined)
	messages$msg_date <- sql_date(messages$msg_date)

	## cleanup missing data -
	messages$msg_country_code <-
	coalesce(messages$msg_country_code, messages$country_code) %>% ## replace message failed IP location with member IP location
	coalesce("US") ## where there is no IP location, convert to US (NB: If the no of NAs is very large, may be better to simply filter those rows out )

	messages$country_code <-
	coalesce(messages$country_code, "US") ## ## where there is no IP location, convert to US (NB: If the no of NAs is very large, may be better to simply filter those rows out )



	message_mapping <- messages %>%
	mutate(iso_a2 = country_code) %>% ##iso_a2 is 2 letter iso country name
	mutate(iso_a2 = recode(
	iso_a2,
	##{choroplethr} data for country.map seems to be missing info for any island nations.
	"SG" = "MY",
	##Ugly hack to avoid failed mapping by allocating to closest country.
	"PR" = "US",
	"MT" = "IT",
	"PF" = "NZ"
	)) %>%
	group_by(iso_a2) %>%
	mutate(value = n()) %>% #Count no of posts per region
	ungroup()


	message_mapping <-
	message_mapping[!duplicated(message_mapping["iso_a2"]), ] #remove duplicates - make df smaller.



	message_mapping <-
	left_join(message_mapping, country.map[, c("iso_a2", "region")], by = "iso_a2")# join to country map to get choropleth regions
	message_mapping <-
	message_mapping[!duplicated(message_mapping["iso_a2"]), ]#remove duplicate regions - choroplethr::country.map() requires unique entries for regions variable


	##Create a world map of the count of forum posts
	country_choropleth(message_mapping,
	title = "IP Geolocation of messages on Iron March forums 2011 - 2017",
	legend = "No. of forum posts")

	##-------------------------------------------------------------------------------------------------------------------------------------#
	##rinse and repeat the above, but rather than allocating messages to msg IP, allocating to member IP (i.e. the IP recorded when joining)
	message_mapping2 <- messages %>%
	mutate(iso_a2 = country_code) %>%
	mutate(iso_a2 = recode(
	iso_a2,
	"SG" = "MY",
	"PR" = "US",
	"MT" = "IT",
	"PF" = "NZ"
	)) %>%
	group_by(iso_a2) %>%
	mutate(value = n()) %>%
	ungroup()

	message_mapping2 <-
	message_mapping2[!duplicated(message_mapping2["iso_a2"]), ]

	message_mapping2 <-
	left_join(message_mapping2, country.map[, c("iso_a2", "region")], by = "iso_a2")
	message_mapping2 <-
	message_mapping2[!duplicated(message_mapping2["iso_a2"]), ]

	country_choropleth(message_mapping2,
	title = "IP Geolocation forum users and volume of messages on Iron March forums 2011 - 2017",
	legend = "No. of forum posts")
	##---------------------------------------------------------------------------------------------------------------------------------------#