thomasjensen

## extract.py
from BeautifulSoup import BeautifulSoup
import os
import re

# Directory that is listed below and then made the working directory.
path = "/Users/thomasjensen/Documents/RBloggersScrape/download"

# Keep only the file names that contain "post<digits>.html".
# re.search returns None when nothing matches, so the result is tested
# with "is not None" (identity) rather than "!= None".
listing = [name for name in os.listdir(path)
           if re.search(r"post\d+\.html", name) is not None]

# Switch the working directory to the listed folder.
os.chdir(path)

## rbloggerAnalysis.r
# attach the packages used by this script
library(plyr)
library(ggplot2)
library(xtable)

# point the working directory at the folder where the output.csv file from
# the previous post was saved (the path below is a placeholder)
setwd(dir = "/.../")

# load output.csv into a data frame
data <- read.csv(file = "output.csv")

## textmining.r
## attach the libraries and set the working directory
library(tm)
library(corrplot)
setwd("/path/to/")

## read the data (the file is read as latin1) and keep only the relevant categories
data <- read.csv("indvandringPolitikken.csv", fileEncoding = "latin1")
## %in% never returns NA, so rows whose kategori is NA are dropped instead of
## becoming all-NA rows, which is what a chain of == comparisons would produce
data <- data[data$kategori %in% c("Politik", "Debat", "Kronikken", "Leder"), ]

##create the corpus and clean it

## ebtrust.r
library(ggplot2)

setwd(dir = "/path/to/file/")

data <- read.csv(file = "ebAll.csv")

# parse the date column using a year/month/day pattern
# NOTE(review): %y is a two-digit year - confirm ebAll.csv stores dates that way
data[["date"]] <- as.Date(data[["date"]], format = "%y/%m/%d")

# strip the percent sign from each answer column and convert it to numeric
data[["Tend.to.trust"]] <- as.numeric(
  gsub(pattern = "%", replacement = "", x = data[["Tend.to.trust"]])
)
data[["Tend.not.to.trust"]] <- as.numeric(
  gsub(pattern = "%", replacement = "", x = data[["Tend.not.to.trust"]])
)
data[["DK...Don.t.know"]] <- as.numeric(
  gsub(pattern = "%", replacement = "", x = data[["DK...Don.t.know"]])
)

## simconf.r
# set the working directory and attach the foreign package (used for read.dta)
setwd(dir = "/.../")
library(foreign)

# load the Stata file and keep only the rows where onset is not 4
# (described in the original script as removing missing values of the
# dependent variable)
data <- read.dta(file = "repdata.dta")
data <- data[data$onset != 4, ]

# fit a binomial GLM of onset on the covariates listed in the formula
model <- glm(
  onset ~ warl + gdpenl + lpopl1 + lmtnest + ncontig + Oil + nwstate +
    instab + polity2l + ethfrac + relfrac,
  data = data,
  family = "binomial"
)
	from BeautifulSoup import BeautifulSoup
	import os
	import re

	# Directory that is listed below and then made the working directory.
	path = "/Users/thomasjensen/Documents/RBloggersScrape/download"

	# Keep only the file names that contain "post<digits>.html".
	# re.search returns None when nothing matches, so the result is tested
	# with "is not None" (identity) rather than "!= None".
	listing = [name for name in os.listdir(path)
	           if re.search(r"post\d+\.html", name) is not None]

	# Switch the working directory to the listed folder.
	os.chdir(path)
	# attach the packages used by this script
	library(plyr)
	library(ggplot2)
	library(xtable)

	# point the working directory at the folder where the output.csv file from
	# the previous post was saved (the path below is a placeholder)
	setwd(dir = "/.../")

	# load output.csv into a data frame
	data <- read.csv(file = "output.csv")
	## attach the libraries and set the working directory
	library(tm)
	library(corrplot)
	setwd("/path/to/")

	## read the data (the file is read as latin1) and keep only the relevant categories
	data <- read.csv("indvandringPolitikken.csv", fileEncoding = "latin1")
	## the backslash-escaped pipes that stood here are not valid R; the filter is
	## written with %in%, which also never returns NA, so rows whose kategori is
	## NA are dropped instead of becoming all-NA rows
	data <- data[data$kategori %in% c("Politik", "Debat", "Kronikken", "Leder"), ]

	##create the corpus and clean it
	library(ggplot2)

	setwd(dir = "/path/to/file/")

	data <- read.csv(file = "ebAll.csv")

	# parse the date column using a year/month/day pattern
	# NOTE(review): %y is a two-digit year - confirm ebAll.csv stores dates that way
	data[["date"]] <- as.Date(data[["date"]], format = "%y/%m/%d")

	# strip the percent sign from each answer column and convert it to numeric
	data[["Tend.to.trust"]] <- as.numeric(
	  gsub(pattern = "%", replacement = "", x = data[["Tend.to.trust"]])
	)
	data[["Tend.not.to.trust"]] <- as.numeric(
	  gsub(pattern = "%", replacement = "", x = data[["Tend.not.to.trust"]])
	)
	data[["DK...Don.t.know"]] <- as.numeric(
	  gsub(pattern = "%", replacement = "", x = data[["DK...Don.t.know"]])
	)
	# set the working directory and attach the foreign package (used for read.dta)
	setwd(dir = "/.../")
	library(foreign)

	# load the Stata file and keep only the rows where onset is not 4
	# (described in the original script as removing missing values of the
	# dependent variable)
	data <- read.dta(file = "repdata.dta")
	data <- data[data$onset != 4, ]

	# fit a binomial GLM of onset on the covariates listed in the formula
	model <- glm(
	  onset ~ warl + gdpenl + lpopl1 + lmtnest + ncontig + Oil + nwstate +
	    instab + polity2l + ethfrac + relfrac,
	  data = data,
	  family = "binomial"
	)