library(tidyverse)
library(magrittr)
#library(readr)
library(googlesheets4)
library(readODS)
library(readxl)
library(jsonlite)
list.dirs(path = mypath)
[1] "/home/cinkova/manage_files/"
[2] "/home/cinkova/manage_files//JSONFILES"
list.files(path = mypath)
[1] "2021-05-07_corrected_tab01.tsv" "bezhlavy_iris.csv"
[3] "ChickWeight.txt" "corrected_tab01.tsv"
[5] "gutenberg01.txt" "gutenbergxml.xml"
[7] "iris.csv" "JSONFILES"
[9] "managing_files_01.nb.html" "managing_files_01.Rmd"
[11] "mtcars.csv" "ockovani.json"
[13] "tab01" "tab01.alfa"
[15] "tab01.csv" "tab01.ods"
[17] "tab01.xlsx" "ToothGrowth.divnej"
[19] "tsv_tabka.tsv"
znaky
[1] "2021-05-07_corrected_tab01.tsv" "bezhlavy_iris.csv"
[3] "ChickWeight.txt" "corrected_tab01.tsv"
[5] "gutenberg01.txt" "gutenbergxml.xml"
[7] "iris.csv" "JSONFILES"
[9] "managing_files_01.nb.html" "managing_files_01.Rmd"
[11] "mtcars.csv" "ockovani.json"
[13] "tab01" "tab01.alfa"
[15] "tab01.csv" "tab01.ods"
[17] "tab01.xlsx" "ToothGrowth.divnej"
[19] "tsv_tabka.tsv"
list.files(path = mypath, full.names = TRUE)
# List every file under mypath, descending into subdirectories and returning full paths.
list.files(path = mypath, recursive = TRUE, include.dirs = FALSE, full.names = TRUE ) # here include.dirs = FALSE works
[1] "/home/cinkova/manage_files//2021-05-07_corrected_tab01.tsv"
[2] "/home/cinkova/manage_files//bezhlavy_iris.csv"
[3] "/home/cinkova/manage_files//ChickWeight.txt"
[4] "/home/cinkova/manage_files//corrected_tab01.tsv"
[5] "/home/cinkova/manage_files//gutenberg01.txt"
[6] "/home/cinkova/manage_files//gutenbergxml.xml"
[7] "/home/cinkova/manage_files//iris.csv"
[8] "/home/cinkova/manage_files//JSONFILES/tabulka_pravnicky_text.json"
[9] "/home/cinkova/manage_files//managing_files_01.nb.html"
[10] "/home/cinkova/manage_files//managing_files_01.Rmd"
[11] "/home/cinkova/manage_files//mtcars.csv"
[12] "/home/cinkova/manage_files//ockovani.json"
[13] "/home/cinkova/manage_files//tab01"
[14] "/home/cinkova/manage_files//tab01.alfa"
[15] "/home/cinkova/manage_files//tab01.csv"
[16] "/home/cinkova/manage_files//tab01.ods"
[17] "/home/cinkova/manage_files//tab01.xlsx"
[18] "/home/cinkova/manage_files//ToothGrowth.divnej"
[19] "/home/cinkova/manage_files//tsv_tabka.tsv"
# Keep only files whose name ENDS in ".csv". The trailing `$` anchors the
# match; an unanchored "\\.csv" would also pick up names such as "x.csv.bak".
my_plaintables <- list.files(path = mypath, pattern = "\\.csv$")
my_plaintables
[1] "bezhlavy_iris.csv" "iris.csv" "mtcars.csv"
[4] "tab01.csv"
Capture more plain text formats with one regular expression: csv, tsv, txt
# One regular expression for the .csv, .tsv and .txt extensions. The dot is
# factored out of the alternation and `$` anchors the extension to the end of
# the file name, so e.g. "notes.txt.old" is not matched by accident.
my_plaintables <- list.files(path = mypath, pattern = "\\.([ct]sv|txt)$")
my_plaintables
[1] "2021-05-07_corrected_tab01.tsv" "bezhlavy_iris.csv"
[3] "ChickWeight.txt" "corrected_tab01.tsv"
[5] "gutenberg01.txt" "iris.csv"
[7] "mtcars.csv" "tab01.csv"
[9] "tsv_tabka.tsv"
Learn to write regular expressions e.g. here: (https://regexone.com/lesson/introduction_abcs)
Fig. 1
"Height","Weight","FirstName","Surname","Visit"
"165","55 ","Hana ","Nova",2010-10-10
"145","43","Anna","Kriva",2017-08-09
"173","87","Jakub","Polak",2021-01-27
"88","32","Josef","Riha","2008=12-04"
Fig. 2
"Height";"Weight";"FirstName";"Surname";"Visit"
"165";"55 ";"Hana ";"Nova";2010-10-10
"145";"43";"Anna";"Kriva";2017-08-09
"173";"87";"Jakub";"Polak";2021-01-27
"88";"32";"Josef";"Riha";"2008=12-04"
Fig. 3
[
{
"Sepal.Length": 6.2,
"Sepal.Width": 3.4,
"Petal.Length": 5.4,
"Petal.Width": 2.3,
"Species": "virginica"
},
{
"Sepal.Length": 5.9,
"Sepal.Width": 3,
"Petal.Length": 5.1,
"Petal.Width": 1.8,
"Species": "virginica"
}
]
Fig. 4
"Height"\t"Weight"\t"FirstName"\t"Surname"\t"Visit"
"165"\t"55 "\t"Hana "\t"Nova"\t2010-10-10
"145"\t"43"\t"Anna"\t"Kriva"\t2017-08-09
"173"\t"87"\t"Jakub"\t"Polak"\t2021-01-27
"88"\t"32"\t"Josef"\t"Riha"\t"2008=12-04"
Fig. 5
"Height" "Weight" "FirstName" "Surname" "Visit"
"165" "55 " "Hana " "Nova" 2010-10-10
"145" "43" "Anna" "Kriva" 2017-08-09
"173" "87" "Jakub" "Polak" 2021-01-27
"88" "32" "Josef" "Riha" "2008=12-04"
readr::read_csv(file = "iris.csv", n_max = 5)
Parsed with column specification:
cols(
Sepal.Length = col_double(),
Sepal.Width = col_double(),
Petal.Length = col_double(),
Petal.Width = col_double(),
Species = col_character()
)
readr::read_csv("iris.csv", col_names = c("SL", "SW", "PL", "PW", "Spec"),skip = 1, n_max = 5)
Parsed with column specification:
cols(
SL = col_double(),
SW = col_double(),
PL = col_double(),
PW = col_double(),
Spec = col_character()
)
readr::read_csv("iris.csv", col_names = FALSE, skip = 1, n_max = 5)
Parsed with column specification:
cols(
X1 = col_double(),
X2 = col_double(),
X3 = col_double(),
X4 = col_double(),
X5 = col_character()
)
readr::read_csv("iris.csv", col_names = c("SL", "SW", "PL", "PW", "Spec"), n_max = 5)
readr::read_csv2("tab01")
readr::read_csv2("tab01", quote = "\"") #play around with quoting characters
Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
Parsed with column specification:
cols(
Height = col_double(),
Weight = col_double(),
FirstName = col_character(),
Surname = col_character(),
Visit = col_character()
)
readr::read_lines("bezhlavy_iris.csv", n_max = 5)
[1] "5.1,3.5,1.4,0.2,setosa" "4.9,3,1.4,0.2,setosa" "4.7,3.2,1.3,0.2,setosa"
[4] "4.6,3.1,1.5,0.2,setosa" "5,3.6,1.4,0.2,setosa"
#how do we read in the file above? try and find out on your own
readr::read_csv("bezhlavy_iris.csv", col_names = FALSE, n_max = 3)
Parsed with column specification:
cols(
X1 = col_double(),
X2 = col_double(),
X3 = col_double(),
X4 = col_double(),
X5 = col_character()
)
readr::read_csv("bezhlavy_iris.csv", col_names = TRUE, n_max = 3)
Parsed with column specification:
cols(
`5.1` = col_double(),
`3.5` = col_double(),
`1.4` = col_double(),
`0.2` = col_double(),
setosa = col_character()
)
read_delim
readr::read_lines(file = "tab01.alfa")
[1] "\"Height\"&\"Weight\"&\"FirstName\"&\"Surname\"&\"Visit\""
[2] "\"165\"&\"55 \"&\"Hana \"&\"Nova\"&2010-10-10"
[3] "\"145\"&\"43\"&\"Anna\"&\"Kriva\"&2017-08-09"
[4] "\"173\"&\"87\"&\"Jakub\"&\"Polak\"&2021-01-27"
[5] "\"88\"&\"32\"&\"Josef\"&\"Riha\"&\"2008=12-04\""
(tabka <- readr::read_delim(file = "tab01.alfa", delim = "&"))
Parsed with column specification:
cols(
Height = col_double(),
Weight = col_character(),
FirstName = col_character(),
Surname = col_character(),
Visit = col_character()
)
Save the tabka table as a tsv
# Write tabka as a tab-separated file. The destination is passed positionally
# because readr renamed this argument from `path` to `file` (1.4.0) and now
# warns on `path =`; the positional form works on either side of the rename.
readr::write_tsv(tabka, "tsv_tabka.tsv")
read it in as a tsv again
tabka2 <- readr::read_tsv("tsv_tabka.tsv")
Parsed with column specification:
cols(
Height = col_double(),
Weight = col_double(),
FirstName = col_character(),
Surname = col_character(),
Visit = col_character()
)
tabka2
Display tabka2. Can you see any strange value that is likely to be wrong? What data types would you expect for the individual columns?
tabka2
# Re-read the TSV with an explicit column specification instead of letting
# readr guess the types. The commented-out list() form is the verbose
# equivalent of the compact string used below.
nacteno <- readr::read_tsv("tsv_tabka.tsv", #col_types = list(col_double(),
# col_double(),
# col_character(),
# col_character(),
# col_date())
col_types = "ddccD" # compact spec: double, double, character, character, date
)# fill in the column types
1 parsing failure.
row col expected actual file
4 Visit date like 2008=12-04 'tsv_tabka.tsv'
nacteno
readr::parse_date(tabka$Visit)
1 parsing failure.
row col expected actual
4 -- date like 2008=12-04
[1] "2010-10-10" "2017-08-09" "2021-01-27" NA
readr::parse_character(as.character(tabka$Visit))
[1] "2010-10-10" "2017-08-09" "2021-01-27" "2008=12-04"
#library(magrittr)
readr::parse_double(tabka$Weight)# %T>% str() # returns the value as well as prints the structure (a magrittr pipe)
[1] 55 43 87 32
weight_by_pipe <- readr::parse_double(as.character(tabka$Height)) %T>% str()
num [1:4] 165 145 173 88
weight_by_pipe
[1] 165 145 173 88
readr::parse_double(as.character(tabka$Height)) # x must be a character vector; if it complains, convert it to one first
str(tabka$Height)
str(tabka)
str(nacteno)
Correct the date directly in R, using the base R you learned long ago
tabka$Visit[4] <- "2008-12-04"
Save the corrected file as corrected_tab01.tsv
todays_filename
[1] "2021-05-07_corrected_tab01.tsv"
Now save today’s file version
# Save the dated copy. The destination is passed positionally because readr
# renamed this argument from `path` to `file` (1.4.0) and now warns on
# `path =`; the positional form works on either side of the rename.
readr::write_tsv(tabka, todays_filename)
This is good e.g. when you run a script daily, for instance if you want to keep track of the Czech covid-19 vaccination statistics:
vac <- readr::read_csv(file = "https://onemocneni-aktualne.mzcr.cz/api/v2/covid-19/ockovani.csv")
Parsed with column specification:
cols(
datum = col_date(format = ""),
vakcina = col_character(),
kraj_nuts_kod = col_character(),
kraj_nazev = col_character(),
vekova_skupina = col_character(),
prvnich_davek = col_double(),
druhych_davek = col_double(),
celkem_davek = col_double()
)
dplyr::glimpse(vac)
Rows: 43,941
Columns: 8
$ datum <date> 2020-12-27, 2020-12-27, 2020-12-27, 2020-12-27, 2020-12-27, 2020-12-…
$ vakcina <chr> "Comirnaty", "Comirnaty", "Comirnaty", "Comirnaty", "Comirnaty", "Com…
$ kraj_nuts_kod <chr> "CZ010", "CZ010", "CZ010", "CZ010", "CZ010", "CZ010", "CZ010", "CZ010…
$ kraj_nazev <chr> "Hlavní město Praha", "Hlavní město Praha", "Hlavní město Praha", "Hl…
$ vekova_skupina <chr> "18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59…
$ prvnich_davek <dbl> 48, 108, 102, 111, 172, 156, 128, 96, 84, 79, 48, 19, 24, 2, 3, 7, 8,…
$ druhych_davek <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ celkem_davek <dbl> 48, 108, 102, 111, 172, 156, 128, 96, 84, 79, 48, 19, 24, 2, 3, 7, 8,…
Kolik vyočkovaly jednotlivé kraje celkem dávek? Vytvořte agregovanou tabulku a uložte si ji do souboru pod názvem dnesnidatum_AgregOckovani.tsv. (Dnešní datum pomocí Sys.Date())
# Total number of doses administered per region, written to disk.
# - The summary column gets an explicit name; left unnamed it would be
#   called `sum(celkem_davek)` in the saved file.
# - The exercise asks for a .tsv file, so the table is written tab-separated
#   (write_tsv) rather than comma-separated.
# - The destination is passed positionally (after the piped table) because
#   readr renamed the argument from `path` to `file` and warns on `path =`.
vac %>%
  dplyr::group_by(kraj_nazev) %>%
  dplyr::summarize(celkem_davek = sum(celkem_davek)) %>%
  readr::write_tsv(vaccination_filename)
`summarise()` ungrouping output (override with `.groups` argument)
Diskutujte - neprogramujte: jak byste si vyrobili tabulku s počtem dávek podle krajů za celý měsíc od zítřka? Kam byste si ukládali data? Spojovali byste soubory? Nemusíte to umět udělat, jen popřemýšlejte.
#Uložte si jakýkoli soubor z webu domů:
download.file(url = "https://onemocneni-aktualne.mzcr.cz/api/v2/covid-19/ockovani.json",
destfile = "ockovani.json")
trying URL 'https://onemocneni-aktualne.mzcr.cz/api/v2/covid-19/ockovani.json'
Content type 'application/json; charset=utf-8' length 13887649 bytes (13.2 MB)
==================================================
downloaded 13.2 MB
download.file(url = "https://www.gutenberg.org/files/26184/26184-8.txt",
destfile = "gutenberg01.txt")
trying URL 'https://www.gutenberg.org/files/26184/26184-8.txt'
Content type 'text/plain' length 75158 bytes (73 KB)
==================================================
downloaded 73 KB
readr::read_lines(file = "gutenberg01.txt", n_max = 5)
[1] "The Project Gutenberg eBook, Simple Sabotage Field Manual, by Strategic"
[2] "Services"
[3] ""
[4] "This eBook is for the use of anyone anywhere at no cost and with"
[5] "almost no restrictions whatsoever. You may copy it, give it away or"
jsonlite::fromJSON("ockovani.json")
$modified
[1] "2021-05-07T08:07:35+02:00"
$source
[1] "https://onemocneni-aktualne.mzcr.cz/"
$data
NA
Načtěte soubor ToothGrowth.divnej jako co nejlepší tabulku.
Použijte knihovnu readODS a načtěte soubor tab01.ods (z tabulkového procesoru Calc v LibreOffice).
Použijte knihovnu readxl a načtěte soubor tab01.xlsx.