Scrape em tabelas
Exercícios
Diretrizes gerais:
- Instale o pacote `tabulizer`:
devtools::install_github(c("ropensci/tabulizerjars", "ropensci/tabulizer"))
## Using github PAT from envvar GITHUB_PAT
## Downloading GitHub repo ropensci/tabulizerjars@HEAD
##
## * checking for file ‘/private/var/folders/kv/97__br293mdbgmqfdkvcq0z00000gn/T/RtmpEWQOcg/remotes13e6f38d563a5/ropensci-tabulizerjars-d1924e0/DESCRIPTION’ ... OK
## * preparing ‘tabulizerjars’:
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## * building ‘tabulizerjars_1.0.1.tar.gz’
## Downloading GitHub repo ropensci/tabulizer@HEAD
##
## * checking for file ‘/private/var/folders/kv/97__br293mdbgmqfdkvcq0z00000gn/T/RtmpEWQOcg/remotes13e6f4da5f2ac/ropensci-tabulizer-3049a79/DESCRIPTION’ ... OK
## * preparing ‘tabulizer’:
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## Removed empty directory ‘tabulizer/docs’
## * building ‘tabulizer_0.2.2.tar.gz’
- Baixe o arquivo .Rmd e a pasta com os arquivos em .pdf e abra no RStudio.
Siga as diretrizes da atividade.
Rode o arquivo .Rmd por meio do ícone knitr.
Salve o .Rmd e submeta-o por meio do email renataoliveira@gmail.com.
library(hrbrthemes)
library(ggplot2)
library(Cairo)
library(extrafont)
library(rJava) # Needed for tabulizer
library(tabulizer) # Handy tool for PDF Scraping
library(tidyverse) # Core data manipulation and visualization libraries
library(janitor)
extrafont::loadfonts()

Este chunk serve para fazer a leitura dos arquivos.
# PDF table scraping ----
# For every PDF in `folder`, extract the table(s) tabulizer finds on page 2,
# normalise them to the columns bairro / sg / srag / obitos, tag each row with
# an identifier taken from the file name, and stack everything into `tmp`.

# Folder holding the PDFs to be loaded (must end with "/": it is pasted
# directly in front of each file name below)
folder <- "pdf/"

# `dir()` takes a regular expression, not a glob, so anchor on the ".pdf"
# suffix instead of using "*.pdf"
pdf_pattern <- "\\.pdf$"

# Number of PDF files in the folder
num_files <- length(dir(folder, pattern = pdf_pattern))

# One-column data frame (column "data") listing the PDF file names
files <- as.data.frame(dir(folder, pattern = pdf_pattern))
names(files) <- "data"

# All-NA template row: fixes the column names and order of the final result.
# It is removed again by the `filter(!is.na(sg))` after the loop.
tmp <- data.frame(bairro = NA, sg = NA, srag = NA, obitos = NA, data = NA)

# Per-file tables are collected here and bound once after the loop, instead
# of growing `tmp` on every iteration
daily_tables <- vector("list", num_files)

# seq_len() yields an empty sequence when the folder has no PDFs
# (1:num_files would iterate over c(1, 0))
for (i in seq_len(num_files)) {
  file_name <- files[i, 1]

  # Characters -14 to -7 (counted from the end) of the file name identify the
  # report. NOTE(review): presumably a date embedded in the name — confirm.
  data <- str_sub(file_name, -14, -7)
  print(data)

  scrape <- tabulizer::extract_tables(
    file = paste0(folder, file_name),
    output = "data.frame",
    pages = 2, # spelled out; `page` only worked through partial matching
    guess = TRUE,
    method = "stream"
  )

  # First table: split the combined column into two, drop the first row, and
  # keep columns 1, 3, 5 and 6 under the common names.
  scrape1 <- as.data.frame(scrape[[1]]) %>%
    separate(
      col = "Síndrome.Respiratória.Aguda.Grave",
      into = c("total de casos", "óbitos"),
      sep = " "
    ) %>%
    slice(-1)
  scrape1 <- scrape1[, c(1, 3, 5, 6)]
  names(scrape1) <- c("bairro", "sg", "srag", "obitos")

  if (length(scrape) != 1) {
    # Second table: its column names are copied back in as a data row.
    # NOTE(review): presumably the first data row was consumed as the header;
    # the target row index 5 is hard-coded — confirm it suits every report.
    scrape2 <- as.data.frame(scrape[[2]])
    scrape2[5, ] <- colnames(scrape2)
    names(scrape2) <- c("bairro", "sg", "srag", "obitos")

    # Drop the "TOTAL" row, then clean the recovered row: "." becomes a space
    # in `bairro`, and the first "X" is stripped from the other columns.
    # NOTE(review): the "." and "X" presumably come from R sanitising the
    # header into syntactic column names — confirm.
    scrape2 <- scrape2 %>%
      filter(bairro != "TOTAL") %>%
      mutate(
        bairro = str_replace_all(bairro, "\\.", " "),
        sg = str_replace(sg, "X", ""),
        srag = str_replace(srag, "X", ""),
        obitos = str_replace(obitos, "X", "")
      )

    tmp_dia <- rbind(scrape1, scrape2)
  } else {
    tmp_dia <- scrape1
  }

  # Tag every row of this file with its identifier
  tmp_dia[, 5] <- data
  names(tmp_dia) <- c("bairro", "sg", "srag", "obitos", "data")

  daily_tables[[i]] <- tmp_dia
}

# Bind once. The original `tmp %>% bind_rows(tmp, tmp_dia)` expanded to
# `bind_rows(tmp, tmp, tmp_dia)` and duplicated all previously accumulated
# rows on every iteration.
tmp <- bind_rows(tmp, daily_tables) %>%
  filter(!is.na(sg))