Skip to content

Instantly share code, notes, and snippets.

@Arf9999
Created November 8, 2019 09:05
Show Gist options
  • Save Arf9999/cb22c37caa75c0ca1b8d4298c0a22c56 to your computer and use it in GitHub Desktop.
Save Arf9999/cb22c37caa75c0ca1b8d4298c0a22c56 to your computer and use it in GitHub Desktop.
##--------------------------------------------------------------------------------------------------##
##Mapping neo-nazi forum post volume to world map using the Iron March Forum leaked SQL database ##
##Leaked database is available here: https://archive.org/details/iron_march_201911 ##
## All below packages are required. All are available on CRAN. ##
##--------------------------------------------------------------------------------------------------##
##packages--------##
library(tidyverse)
library(rgeolocate) ## free ip2location binary datafile is required: You can get it here https://lite.ip2location.com/database/ip-country (registration is required)
## Once downloaded, decompress and save to R working directory, (or edit the code to point to it)
library (choroplethr)
library (chloroplethrMaps)
##-----------------##
data(country.map, package = "choroplethrMaps") # The country map data
## The Iron March forum leak is in a SQLite format. SQLite doesn't manage dates very clearly. This is a quick function to change sql date to POSIXct.
sql_date <- function(date_field) {
as.POSIXct(as.numeric(date_field), origin = "1970-01-01")
}
#Get the core members from archive.org csv
members <-
read_csv("https://archive.org/download/iron_march_201911/csv/core_members.csv")
members <- members %>%
mutate(no = row_number())##create index for joins
##-- Get IP geolocation of members --##
geolocation <-
rgeolocate::ip2location(members$ip_address, file = "IP2LOCATION-LITE-DB1.BIN", c("country_code", "country_name")) %>%
mutate(no = row_number())##create index for joins
##------------------------##
members <-
left_join(members, geolocation, by = "no") ##Join to geolocation to members df
members <-
members[, c(1:6, 96, 97, 7:75)] #re-order dataframe columns for easier reading
##Get forum messages from archive.org csv
messages <-
read_csv("https://archive.org/download/iron_march_201911/csv/core_message_posts.csv") %>%
mutate(join_no = row_number()) %>%
mutate(member_id = msg_author_id) ##create index for joins to members df
message_geolocation <-
rgeolocate::ip2location(o_members$ip_address,
file = "IP2LOCATION-LITE-DB1.BIN",
c("country_code", "country_name")) %>%
mutate(join_no = row_number()) %>% ##create index for joins
rename(msg_country_code = country_code, msg_country_name = country_name) ##adjust column names for join
messages <-
left_join(messages, message_geolocation, by = "join_no") %>% ## join geolocation to messages
left_join(members[, c(1:8)], by = "member_id")## join file for member details
#convert sql dates - this isn't really required for the mapping - but makes reading the data using View() more pleasant
messages$joined <- sql_date(messages$joined)
messages$msg_date <- sql_date(messages$msg_date)
## cleanup missing data -
messages$msg_country_code <-
coalesce(messages$msg_country_code, messages$country_code) %>% ## replace message failed IP location with member IP location
coalesce("US") ## where there is no IP location, convert to US (NB: If the no of NAs is very large, may be better to simply filter those rows out )
messages$country_code <-
coalesce(messages$country_code, "US") ## ## where there is no IP location, convert to US (NB: If the no of NAs is very large, may be better to simply filter those rows out )
message_mapping <- messages %>%
mutate(iso_a2 = country_code) %>% ##iso_a2 is 2 letter iso country name
mutate(iso_a2 = recode(
iso_a2,
##{choroplethr} data for country.map seems to be missing info for any island nations.
"SG" = "MY",
##Ugly hack to avoid failed mapping by allocating to closest country.
"PR" = "US",
"MT" = "IT",
"PF" = "NZ"
)) %>%
group_by(iso_a2) %>%
mutate(value = n()) %>% #Count no of posts per region
ungroup()
message_mapping <-
message_mapping[!duplicated(message_mapping["iso_a2"]), ] #remove duplicates - make df smaller.
message_mapping <-
left_join(message_mapping, country.map[, c("iso_a2", "region")], by = "iso_a2")# join to country map to get choropleth regions
message_mapping <-
message_mapping[!duplicated(message_mapping["iso_a2"]), ]#remove duplicate regions - choroplethr::country.map() requires unique entries for regions variable
##Create a world map of the count of forum posts
country_choropleth(message_mapping,
title = "IP Geolocation of messages on Iron March forums 2011 - 2017",
legend = "No. of forum posts")
##-------------------------------------------------------------------------------------------------------------------------------------#
##rinse and repeat the above, but rather than allocating messages to msg IP, allocating to member IP (i.e. the IP recorded when joining)
message_mapping2 <- messages %>%
mutate(iso_a2 = country_code) %>%
mutate(iso_a2 = recode(
iso_a2,
"SG" = "MY",
"PR" = "US",
"MT" = "IT",
"PF" = "NZ"
)) %>%
group_by(iso_a2) %>%
mutate(value = n()) %>%
ungroup()
message_mapping2 <-
message_mapping2[!duplicated(message_mapping2["iso_a2"]), ]
message_mapping2 <-
left_join(message_mapping2, country.map[, c("iso_a2", "region")], by = "iso_a2")
message_mapping2 <-
message_mapping2[!duplicated(message_mapping2["iso_a2"]), ]
country_choropleth(message_mapping2,
title = "IP Geolocation forum users and volume of messages on Iron March forums 2011 - 2017",
legend = "No. of forum posts")
##---------------------------------------------------------------------------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment