Created
November 8, 2019 09:05
-
-
Save Arf9999/cb22c37caa75c0ca1b8d4298c0a22c56 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##--------------------------------------------------------------------------------------------------## | |
##Mapping neo-nazi forum post volume to world map using the Iron March Forum leaked SQL database ## | |
##Leaked database is available here: https://archive.org/details/iron_march_201911 ## | |
## All below packages are required. All are available on CRAN. ## | |
##--------------------------------------------------------------------------------------------------## | |
##packages--------## | |
library(tidyverse) | |
library(rgeolocate) ## free ip2location binary datafile is required: You can get it here https://lite.ip2location.com/database/ip-country (registration is required) | |
## Once downloaded, decompress and save to R working directory, (or edit the code to point to it) | |
library (choroplethr) | |
library (chloroplethrMaps) | |
##-----------------## | |
data(country.map, package = "choroplethrMaps") # The country map data | |
## The Iron March forum leak is in a SQLite format. SQLite doesn't manage dates very clearly. This is a quick function to change sql date to POSIXct. | |
sql_date <- function(date_field) { | |
as.POSIXct(as.numeric(date_field), origin = "1970-01-01") | |
} | |
#Get the core members from archive.org csv | |
members <- | |
read_csv("https://archive.org/download/iron_march_201911/csv/core_members.csv") | |
members <- members %>% | |
mutate(no = row_number())##create index for joins | |
##-- Get IP geolocation of members --## | |
geolocation <- | |
rgeolocate::ip2location(members$ip_address, file = "IP2LOCATION-LITE-DB1.BIN", c("country_code", "country_name")) %>% | |
mutate(no = row_number())##create index for joins | |
##------------------------## | |
members <- | |
left_join(members, geolocation, by = "no") ##Join to geolocation to members df | |
members <- | |
members[, c(1:6, 96, 97, 7:75)] #re-order dataframe columns for easier reading | |
##Get forum messages from archive.org csv | |
messages <- | |
read_csv("https://archive.org/download/iron_march_201911/csv/core_message_posts.csv") %>% | |
mutate(join_no = row_number()) %>% | |
mutate(member_id = msg_author_id) ##create index for joins to members df | |
message_geolocation <- | |
rgeolocate::ip2location(o_members$ip_address, | |
file = "IP2LOCATION-LITE-DB1.BIN", | |
c("country_code", "country_name")) %>% | |
mutate(join_no = row_number()) %>% ##create index for joins | |
rename(msg_country_code = country_code, msg_country_name = country_name) ##adjust column names for join | |
messages <- | |
left_join(messages, message_geolocation, by = "join_no") %>% ## join geolocation to messages | |
left_join(members[, c(1:8)], by = "member_id")## join file for member details | |
#convert sql dates - this isn't really required for the mapping - but makes reading the data using View() more pleasant | |
messages$joined <- sql_date(messages$joined) | |
messages$msg_date <- sql_date(messages$msg_date) | |
## cleanup missing data - | |
messages$msg_country_code <- | |
coalesce(messages$msg_country_code, messages$country_code) %>% ## replace message failed IP location with member IP location | |
coalesce("US") ## where there is no IP location, convert to US (NB: If the no of NAs is very large, may be better to simply filter those rows out ) | |
messages$country_code <- | |
coalesce(messages$country_code, "US") ## ## where there is no IP location, convert to US (NB: If the no of NAs is very large, may be better to simply filter those rows out ) | |
message_mapping <- messages %>% | |
mutate(iso_a2 = country_code) %>% ##iso_a2 is 2 letter iso country name | |
mutate(iso_a2 = recode( | |
iso_a2, | |
##{choroplethr} data for country.map seems to be missing info for any island nations. | |
"SG" = "MY", | |
##Ugly hack to avoid failed mapping by allocating to closest country. | |
"PR" = "US", | |
"MT" = "IT", | |
"PF" = "NZ" | |
)) %>% | |
group_by(iso_a2) %>% | |
mutate(value = n()) %>% #Count no of posts per region | |
ungroup() | |
message_mapping <- | |
message_mapping[!duplicated(message_mapping["iso_a2"]), ] #remove duplicates - make df smaller. | |
message_mapping <- | |
left_join(message_mapping, country.map[, c("iso_a2", "region")], by = "iso_a2")# join to country map to get choropleth regions | |
message_mapping <- | |
message_mapping[!duplicated(message_mapping["iso_a2"]), ]#remove duplicate regions - choroplethr::country.map() requires unique entries for regions variable | |
##Create a world map of the count of forum posts | |
country_choropleth(message_mapping, | |
title = "IP Geolocation of messages on Iron March forums 2011 - 2017", | |
legend = "No. of forum posts") | |
##-------------------------------------------------------------------------------------------------------------------------------------# | |
##rinse and repeat the above, but rather than allocating messages to msg IP, allocating to member IP (i.e. the IP recorded when joining) | |
message_mapping2 <- messages %>% | |
mutate(iso_a2 = country_code) %>% | |
mutate(iso_a2 = recode( | |
iso_a2, | |
"SG" = "MY", | |
"PR" = "US", | |
"MT" = "IT", | |
"PF" = "NZ" | |
)) %>% | |
group_by(iso_a2) %>% | |
mutate(value = n()) %>% | |
ungroup() | |
message_mapping2 <- | |
message_mapping2[!duplicated(message_mapping2["iso_a2"]), ] | |
message_mapping2 <- | |
left_join(message_mapping2, country.map[, c("iso_a2", "region")], by = "iso_a2") | |
message_mapping2 <- | |
message_mapping2[!duplicated(message_mapping2["iso_a2"]), ] | |
country_choropleth(message_mapping2, | |
title = "IP Geolocation forum users and volume of messages on Iron March forums 2011 - 2017", | |
legend = "No. of forum posts") | |
##---------------------------------------------------------------------------------------------------------------------------------------# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment