Lab 5.2

Scrape em tabelas

Exercícios

Diretrizes gerais:

Instale o pacote tabulizer

# Install tabulizerjars and tabulizer from their ropensci GitHub repositories
devtools::install_github(
   c("ropensci/tabulizerjars", "ropensci/tabulizer")
)

## Using github PAT from envvar GITHUB_PAT

## Downloading GitHub repo ropensci/tabulizerjars@HEAD

## 
## * checking for file ‘/private/var/folders/kv/97__br293mdbgmqfdkvcq0z00000gn/T/RtmpEWQOcg/remotes13e6f38d563a5/ropensci-tabulizerjars-d1924e0/DESCRIPTION’ ... OK
## * preparing ‘tabulizerjars’:
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## * building ‘tabulizerjars_1.0.1.tar.gz’

## Downloading GitHub repo ropensci/tabulizer@HEAD

## 
## * checking for file ‘/private/var/folders/kv/97__br293mdbgmqfdkvcq0z00000gn/T/RtmpEWQOcg/remotes13e6f4da5f2ac/ropensci-tabulizer-3049a79/DESCRIPTION’ ... OK
## * preparing ‘tabulizer’:
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## Removed empty directory ‘tabulizer/docs’
## * building ‘tabulizer_0.2.2.tar.gz’

Baixe o arquivo .Rmd e a pasta com os arquivos .pdf (o código espera que ela se chame pdf/) e abra o .Rmd no RStudio.

Arquivo .Rmd

Arquivos .pdf

Siga as diretrizes da atividade.
Rode o arquivo .Rmd por meio do botão Knit do RStudio.
Salve o .Rmd e submeta-o por meio do email renataoliveira@gmail.com.

library(hrbrthemes)
library(ggplot2)
library(Cairo)
library(extrafont)
library(rJava)      # Needed for tabulizer
library(tabulizer)  # Handy tool for PDF Scraping
library(tidyverse)  # Core data manipulation and visualization libraries
library(janitor)

extrafont::loadfonts()

Este chunk serve para fazer a leitura dos arquivos

# PDF Scrape Tables

# Folder (relative to the working directory) holding the PDFs to be loaded
folder <- "pdf/"

# List the PDFs once. `pattern` is a regular expression, not a shell glob, so
# anchor on the literal ".pdf" extension; the previous "*.pdf" also matched
# names such as "xpdf.txt" or "report.pdf.bak".
pdf_names <- dir(folder, pattern = "\\.pdf$")

# Number of files in folder
num_files <- length(pdf_names)

# One-column data frame of PDF file names; the column is called "data"
files <- as.data.frame(pdf_names)
names(files) <- "data"

## Scrape the page-2 table(s) of every PDF and stack them into `tmp`
##
## Result: `tmp`, one row per scraped table row, with columns
## bairro, sg, srag, obitos and data (the stamp cut from the file name).

# Seed that fixes the column layout of the result; its single all-NA row is
# removed by the final filter(), so `tmp` is still well-formed with zero PDFs.
tmp <- data.frame(bairro = NA, sg = NA, srag = NA, obitos = NA, data = NA)

# One slot per PDF, filled inside the loop and bound once at the end instead
# of growing `tmp` on every iteration.
daily_tables <- vector("list", num_files)

# seq_len() yields an empty sequence when there are no files (1:0 would not)
for (i in seq_len(num_files)) {
   file_name <- files[i, 1]

   # Characters -14..-7 of the file name become the `data` column
   # (presumably an 8-character date stamp -- TODO confirm the naming scheme).
   # print() is kept so the stamp still shows up in the knitted output.
   data <- print(str_sub(file_name, -14, -7))

   scrape <- tabulizer::extract_tables(
      file = paste0(folder, file_name),
      output = "data.frame",
      pages = 2,          # spelled out; `page` only worked by partial matching
      guess = TRUE,
      method = "stream"
   )

   # First table: split the merged column into its two values, drop the first
   # row, then keep columns 1, 3, 5 and 6 under the common names.
   scrape1 <- as.data.frame(scrape[[1]]) %>%
      separate(
         col = 'Síndrome.Respiratória.Aguda.Grave',
         into = c("total de casos", "óbitos"),
         sep = ' '
      ) %>%
      slice(-1)
   scrape1 <- scrape1[, c(1, 3, 5, 6)]
   names(scrape1) <- c("bairro", "sg", "srag", "obitos")

   if (length(scrape) != 1) {
      # Second table: its column names are re-inserted as a data row and then
      # cleaned up ("." -> " ", "X" removed) by the mutate() below.
      scrape2 <- as.data.frame(scrape[[2]])
      # Append after the last row; the previous fixed index 5 would overwrite
      # a data row whenever this table has five or more rows.
      scrape2[nrow(scrape2) + 1, ] <- colnames(scrape2)
      names(scrape2) <- c("bairro", "sg", "srag", "obitos")
      scrape2 <- scrape2 %>%
         filter(bairro != "TOTAL") %>%
         mutate(
            bairro = str_replace_all(bairro, "\\.", " "),
            sg = str_replace(sg, "X", ""),
            srag = str_replace(srag, "X", ""),
            obitos = str_replace(obitos, "X", "")
         )
      tmp_dia <- rbind(scrape1, scrape2)
   } else {
      tmp_dia <- scrape1
   }

   # Tag every row of this file with the stamp taken from its name
   tmp_dia[, 5] <- data
   names(tmp_dia) <- c("bairro", "sg", "srag", "obitos", "data")

   daily_tables[[i]] <- tmp_dia
}

# Bind once. The previous `tmp %>% bind_rows(tmp, tmp_dia)` passed `tmp` twice
# and therefore duplicated all earlier rows on every iteration.
tmp <- bind_rows(tmp, daily_tables) %>%
   filter(!is.na(sg))

Last updated on Oct 15, 2021