Last active
December 1, 2016 19:08
-
-
Save jspaezp/af7782f96d38737145b52a459a75046d to your computer and use it in GitHub Desktop.
Remover_lineas_indeseadas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# La solucion tiene que ser solo desde R ?? | |
# en caso de que no, puedes remover dichas lineas con grep y cat en unix | |
# en el caso de que si ..... yo primero leeria todos los datos por linea (sep = "\n")
# despues usaria una expresion regular para sacar los que no quiero (dplyr::filter(grepl())) | |
# y despues los separaria para que quedaran como los quiero. (tidyr::separate) | |
# First, read the raw file one line per row: using the newline as the field
# separator keeps each line intact in a single character column (V1), so the
# columns are not split at this stage.
# library() is used instead of require() so a missing package stops the script
# here instead of failing later with a less obvious error.
library(tidyverse) # used for every step; the same can be done with base R
tabla_completa <- read.table(
  "~/SST_Tmax_amj_Cusco_prob.txt",
  sep = "\n",
  as.is = TRUE,
  quote = "",        # treat quote characters in the file as ordinary text
  comment.char = ""  # do not discard anything that follows a '#'
) %>%
  as_tibble() # tbl_df() is deprecated; as_tibble() is its replacement
######################## RESULTADO de la tabla completa | |
### A tibble: 104 × 1 | |
##V1 | |
##<chr> | |
##1 xmlns:cpt=http://iri.columbia.edu/CPT/v10/ | |
##2 cpt:ncats=3 | |
##3 cpt:field=prcp, cpt:C=1, cpt:clim_prob=0.333333333333, cpt:nrow=30, cpt:ncol=7, | |
##4 \tGRANJA\tKCAYRA\tURUBAMBA\tANTA\tANCACHURO\tPARURO\tACOMAYO | |
##5 cpt:Y\t-13.5570000000\t-13.3040000000\t-13.4680000000\t-13.7680000000\t-13.9170000000 | |
##6 cpt:X\t-71.8750000000\t-72.1250000000\t-72.2160000000\t-71.8450000000\t-71.6840000000 | |
##7 1986-04/06\t36.5774685334\t40.0417226666\t52.0471586526\t53.6844007815\t27.4341584786 | |
##8 1987-04/06\t4.31135812254\t21.1286522134\t9.66070546609\t20.7177124686\t2.55429491075 | |
##9 1988-04/06\t12.1848779602\t29.2176289354\t27.8645350556\t33.3535418833\t11.9981532698 | |
##10 1989-04/06\t77.8363503839\t58.3669823466\t88.5511720179\t84.5855644454\t73.7035487261 | |
### ... with 94 more rows | |
# Keep only the data rows: grepl() returns a logical vector that flags the
# lines whose text begins with four digits, and filter() keeps the rows where
# that flag is TRUE.
lineas_con_datos <- filter(
  tabla_completa,
  grepl(pattern = "^[\\d]{4}", x = V1, perl = TRUE)
)
######################## RESULTADO despues de remover las lineas indeseadas | |
## # A tibble: 90 × 1 | |
## V1 | |
## <chr> | |
## 1 1986-04/06\t36.5774685334\t40.0417226666\t52.0471586526\t53.6844007815\t27.4341584786 | |
## 2 1987-04/06\t4.31135812254\t21.1286522134\t9.66070546609\t20.7177124686\t2.55429491075 | |
## 3 1988-04/06\t12.1848779602\t29.2176289354\t27.8645350556\t33.3535418833\t11.9981532698 | |
## 4 1989-04/06\t77.8363503839\t58.3669823466\t88.5511720179\t84.5855644454\t73.7035487261 | |
## 5 1990-04/06\t18.9982211911\t35.3834849714\t37.5254358901\t42.4262562644\t22.3915641959 | |
## 6 1991-04/06\t20.6420170156\t35.2785968799\t36.7974705686\t42.6057717002\t19.4096579222 | |
## 7 1992-04/06\t4.62370394314\t18.1117656420\t11.6771564910\t17.7568990424\t3.61570359941 | |
## 8 1993-04/06\t8.20665753005\t23.3036768502\t22.5636715052\t22.4831468871\t7.38365963785 | |
## 9 1994-04/06\t22.0731783854\t33.0364936792\t40.2010622657\t37.1377730735\t20.8246318356 | |
## 10 1995-04/06\t14.4263594309\t26.5684615201\t26.8680507654\t26.3415690329\t13.4765168853 | |
## # ... with 80 more rows | |
# Split every data line into columns and show the first rows.
# The number of value columns is taken from the data (tab-separated pieces per
# line) instead of being hard-coded to 10, so separate() no longer pads the
# result with all-NA columns nor warns about missing pieces.
# max(1L, ...) keeps the call valid even if no data lines were selected.
n_columnas <- max(
  1L,
  lengths(strsplit(lineas_con_datos$V1, "\t", fixed = TRUE))
)
lineas_con_datos %>%
  # FECHA receives the text before the "/"; everything after it goes to RESTO
  separate(V1, sep = "/", into = c("FECHA", "RESTO")) %>%
  # RESTO is tab-separated; its first piece is the text that followed the "/"
  # (e.g. "06"), so that piece ends up in V1 and the values start at V2
  separate(RESTO, sep = "\t", into = paste0("V", seq_len(n_columnas))) %>%
  head() %>%
  as.data.frame()
######################## RESULT: final table (first six rows)
#     FECHA V1            V2            V3            V4            V5
# 1 1986-04 06 36.5774685334 40.0417226666 52.0471586526 53.6844007815
# 2 1987-04 06 4.31135812254 21.1286522134 9.66070546609 20.7177124686
# 3 1988-04 06 12.1848779602 29.2176289354 27.8645350556 33.3535418833
# 4 1989-04 06 77.8363503839 58.3669823466 88.5511720179 84.5855644454
# 5 1990-04 06 18.9982211911 35.3834849714 37.5254358901 42.4262562644
# 6 1991-04 06 20.6420170156 35.2785968799 36.7974705686 42.6057717002
#              V6            V7            V8
# 1 27.4341584786 26.2467891473 34.5204226786
# 2 2.55429491075 2.28400835030 4.75621700701
# 3 11.9981532698 13.3062592097 15.8955269910
# 4 73.7035487261 72.2072602640 78.4357440584
# 5 22.3915641959 22.6200320881 24.8973724454
# 6 19.4096579222 23.0793050447 22.4510269544
## EQUIVALENT WITHOUT THE PIPE
# Read the file one line per row into a single character column (V1).
tabla_completa <- read.table(
  "~/SST_Tmax_amj_Cusco_prob.txt",
  sep = "\n",
  as.is = TRUE,
  quote = "",        # treat quote characters in the file as ordinary text
  comment.char = ""  # do not discard anything that follows a '#'
)
# grepl() returns a logical vector (TRUE / FALSE) flagging the lines that begin
# with four digits. drop = FALSE is required: tabla_completa has a single
# column, so plain `[rows, ]` would collapse the result to a character vector
# and the separate() calls below, which need a data frame, would fail.
lineas_con_datos <- tabla_completa[
  grepl("^[\\d]{4}", tabla_completa$V1, perl = TRUE), ,
  drop = FALSE
]
# Number of tab-separated pieces per line, taken from the data so that no
# all-NA padding columns are created; max(1L, ...) covers the no-rows case.
n_columnas <- max(
  1L,
  lengths(strsplit(lineas_con_datos$V1, "\t", fixed = TRUE))
)
# Split at the "/" found in each line: the text before it goes to FECHA and
# everything after it goes to RESTO.
datos_intermedios <- separate(
  lineas_con_datos, V1,
  sep = "/", into = c("FECHA", "RESTO")
)
# Split RESTO at the tab characters, which are written as '\t' in text.
datos_finales <- separate(
  datos_intermedios, RESTO,
  sep = "\t", into = paste0("V", seq_len(n_columnas))
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment