Skip to content

Instantly share code, notes, and snippets.

@AndreaCirilloAC
Created September 29, 2016 10:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AndreaCirilloAC/05acee1dd6a921048efa7ca89750fff0 to your computer and use it in GitHub Desktop.
Save AndreaCirilloAC/05acee1dd6a921048efa7ca89750fff0 to your computer and use it in GitHub Desktop.
Quick and dirty code to extract text from an HTML file with custom tags
library(rio)
library(dplyr)
#' Read a text file into a character vector, one element per line.
#'
#' @param file_path Path of the file to read.
#' @return A character vector holding the lines of the file in order;
#'   `character(0)` when the file is empty.
read_file <- function(file_path) {
  file_connection <- file(file_path, "r")
  # Close the connection even when reading fails part-way through.
  on.exit(close(file_connection), add = TRUE)
  # One readLines() call fetches every line, avoiding the quadratic cost of
  # growing the result with c() one line at a time.
  readLines(file_connection)
}
#' Extract the text of selected tagged lines from an HTML file.
#'
#' Keeps only the lines that contain one of the section, article, bold or
#' italic opening tags, strips the known markup fragments from those lines,
#' prints the result and writes it to `txt_file_destination`, one element
#' per line.
#'
#' @param html_file_path Path of the HTML file to read.
#' @param txt_file_destination Path of the text file to write.
#' @return The cleaned lines, invisibly.
text_from_html <- function(html_file_path, txt_file_destination) {
  data_vector <- read_file(html_file_path) # all lines of the HTML file

  # Opening tags that mark a line as worth keeping.
  tag_patterns <- c(
    '<p class="ti-section-1"',
    '<p class="ti-section-2"',
    '<span class="bold">',
    '<p class="ti-art"',
    '<p class="sti-art">',
    '<span class="italic">'
  )
  # Markup fragments removed from the kept lines, in this order: the opening
  # tags first, then the leftover attribute and closing-tag pieces.
  cleanup_patterns <- c(
    tag_patterns,
    '</p>',
    '-1-1',
    'id="d1e',
    '"L_2013176IT.01000101-d-',
    '">',
    '</span>',
    '<p class="tbl-txt'
  )

  # The patterns are literal markup, not regular expressions, so match them
  # with fixed = TRUE; otherwise "." would match any character.
  keep <- rep(FALSE, length(data_vector))
  for (pattern in tag_patterns) {
    keep <- keep | grepl(pattern, data_vector, fixed = TRUE)
  }
  # A logical mask keeps matching lines once each, in file order.
  text <- data_vector[keep]

  for (pattern in cleanup_patterns) {
    text <- gsub(pattern, "", text, fixed = TRUE)
  }

  print(text)
  write(text, txt_file_destination)
  invisible(text)
}
# Run the extraction on the sample document and save the text to report.txt.
text_from_html(
  html_file_path = "L_2013176IT.01000101.xml.html",
  txt_file_destination = "report.txt"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment