修訂. | b0f9052445f388d368bbf5d07343d0aa202d43a7 |
---|---|
大小 | 15,377 bytes |
時間 | 2022-02-17 01:05:51 |
作者 | Lorenzo Isella |
Log Message | Another flipbook but for the covid tracker -- work in progress! |
---
title: "Covid Tracker Data Processing"
subtitle: "A Step-by-step Guide"
author: "Lorenzo Isella"
output:
xaringan::moon_reader:
lib_dir: libs
css: [default, hygge, ninjutsu]
nature:
ratio: 16:10
highlightStyle: github
highlightLines: true
countIncrementalSlides: false
---
```{r setup, include = FALSE}
library(flipbookr)
library(tidyverse)
knitr::opts_chunk$set(fig.width = 6, message = FALSE,
warning = FALSE, comment = "",
cache = F)
```
```{css, eval = TRUE, echo = FALSE}
.remark-code{line-height: 1.5; font-size: 80%}
@media print {
.has-continuation {
display: block;
}
}
```
```{r intro, include = FALSE}
library(tidyverse)
library(janitor)
library(lubridate)
library(openxlsx)
library(stringr)
library(formattable)
library(stringi)
library(kableExtra)
library(flextable)
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
```
---
First we load the libraries and thus set up the working environment
---
`r chunk_reveal(chunk_name = "intro" ,
title = "### Load the libraries and select some years")`
---
We set up vectors and tibbles with MS and years that we need to
process the data
---
```{r ms, include = FALSE}
exclude_ms <- c("UK")
ms_list <- c("AT", "BE", "BG" ,
"CY" , "CZ" , "DE",
"DK" , "EE", "EL" ,
"ES", "FI", "FR" ,
"HR" , "HU" , "IE" ,
"IT" , "LT" , "LU",
"LV" , "MT", "NL" ,
"PL" , "PT" , "RO",
"SE" , "SI" , "SK")
year_focus <- c(2019,2020,2021)
table_generation <- 1 ## whether I should generate the tables or not
names <- tibble(iso2= c("AT", "BE", "BG" , "CY" , "CZ" , "DE",
"DK" , "EE", "EL" , "ES", "FI", "FR" ,
"HR" , "HU" , "IE" , "IT" , "LT" , "LU",
"LV" , "MT", "NL" ,"PL" , "PT" , "RO",
"SE" , "SI" , "SK", "UK" , "Total"),
country=c("Austria", "Belgium", "Bulgaria","Cyprus", "Czechia", "Germany",
"Denmark","Estonia", "Greece","Spain","Finland", "France",
"Croatia","Hungary", "Ireland", "Italy","Lithuania", "Luxembourg",
"Latvia","Malta", "Netherlands","Poland","Portugal", "Romania",
"Sweden","Slovenia", "Slovakia", "United Kingdom","Total EU"))
names%>% glimpse
```
---
`r chunk_reveal(chunk_name = "ms" ,
title = "### Set up the lists of MS")`
---
Then create two vectors with the different spelling of the TF (with and
without colon) that we will need when we collect the stats on the TF
chapters.
---
```{r tf, include = FALSE}
tf_list <- c("TF1:", "TF2:", "TF3:", "TF4:",
"TF6:", "TF7:", "TF8:",
"TF9:", "TF10:", "TF11:", "TF12:",
"TF13:", "TF14:")
tf_list2 <- c("TF1,", "TF2", "TF3", "TF4",
"TF6", "TF7", "TF8",
"TF9", "TF10", "TF11", "TF12",
"TF13", "TF14")
tf_list
tf_list2
```
---
`r chunk_reveal(chunk_name = "tf" ,
title = "### Set up the lists of for the TF")`
---
Now I read the amendments and decisions from the excel file
---
```{r ini_import, include = FALSE}
df_am_ini <- read.xlsx("./input/covid_tracker.xlsx", sheet="Amendments") %>%
as_tibble()
df_am_ini
df_dec_ini <- read.xlsx("./input/covid_tracker.xlsx", sheet="Decisions") %>%
as_tibble()
df_dec_ini%>% glimpse
```
---
`r chunk_reveal(chunk_name = "ini_import" ,
title = "### Raw data for amendments and decisions")`
---
Clean the amendment and decision data and save them in an R friendly
format (RDS file)
---
```{r clean_save, include = FALSE}
df_am <- df_am_ini %>%
clean_data() %>% ### clean the amendments data
filter(member_state_of_amendment %!in% exclude_ms)%>%
pattern_to_na(":") %>%
mutate(decision_date=if_else(is.na(decision_date_manually_encoded),
decision_date_from_isis_not_accurate,
decision_date_manually_encoded))
df_am%>% glimpse
saveRDS(df_am, "./intermediate_files/amendments_data.RDS")
amendments_decision_date <- df_am %>%
select(amendment_case_number, decision_date) %>%
distinct()
df_dec <- df_dec_ini %>%
clean_data() %>% ### clean the decision data
filter(member_state_code %!in% exclude_ms)%>%
pattern_to_na(":")
df_dec%>% glimpse
saveRDS(df_dec,"./intermediate_files/decisions_data.RDS")
```
---
`r chunk_reveal(chunk_name = "clean_save" ,
title = "### Clean and save amendments and decisions")`
---
Read the data from the report1 sheet, clean it and deduplicate it when necessary
in order to reproduce the sheet report1_unique. Save the cleaned data
both as an RDS and an Excel file.
---
```{r report, include = FALSE}
rep1_ini <- read.xlsx("./input/covid_tracker.xlsx", sheet="Report 1 - detailed view") %>%
as_tibble()
rep1 <- rep1_ini %>%
clean_data() %>% ## clean the data of the full report
distinct() %>%
filter(member_state_2_letter_code %!in% exclude_ms) %>%
mutate(confirmed_budgets=as.numeric(confirmed_budgets)) %>%
mutate(decision_date=as.integer(decision_date)) %>%
mutate(number_of_measures_whitin_the_scheme=
as.integer(number_of_measures_whitin_the_scheme)) %>%
pattern_to_na(":")
rep1%>% glimpse
saveRDS(rep1,"./intermediate_files/report1_detailed.RDS")
rep1_unique <- rep1 %>%
filter(phase_name %in% c("Notification", "Prenotification"),
!is.na(decided)) %>%
distinct(case_reference, .keep_all=T)
rep1_unique%>% glimpse
saveRDS(rep1_unique,"./intermediate_files/report1_unique.RDS")
save_excel(rep1_unique, "./intermediate_files/data_budget.xlsx")
```
---
`r chunk_reveal(chunk_name = "report" ,
title = "### Clean and save the report data")`
---
Now a bunch of statistics and aggregations concerning the budget
---
```{r budget1, include = FALSE}
budget_test <- rep1_unique %>%
group_by(member_state_2_letter_code) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
ungroup %>%
mutate(share=budget/sum(budget)) %>%
add_total(nrow(.)+1) %>%
filter(!is.na(member_state_2_letter_code))
budget_test%>% glimpse
saveRDS(budget_test, "./intermediate_files/budget_MS.RDS")
```
---
`r chunk_reveal(chunk_name = "budget1" ,
title = "### Budget by MS and save the result")`
---
Budget for a selection of years in the vector year_focus
---
```{r budget2, include = FALSE}
budget_test_focus <- rep1_unique %>%
mutate(decision_date=excel_numeric_to_date(decision_date)) %>%
mutate(year=year(decision_date)) %>%
filter(year %in% year_focus) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T))
budget_test_focus%>% glimpse
```
---
`r chunk_reveal(chunk_name = "budget2" ,
title = "### Total Budget by for selected years")`
---
```{r budget3, include = FALSE}
budget_type <- rep1_unique %>%
group_by(type_of_aid) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
ungroup %>%
filter(complete.cases(.)) %>%
mutate(share=budget/sum(budget)) %>%
mutate(name=type_of_aid) %>%
arrange(desc(budget))%>%
add_total(nrow(.)+1)
budget_type%>% glimpse
budget_type_ms <- rep1_unique %>%
group_by( member_state_2_letter_code, type_of_aid) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
na.omit %>%
mutate(share=budget/sum(budget, na.rm=T)) %>%
group_modify(~ .x %>%
adorn_totals("row")) %>%
ungroup %>%
complete(member_state_2_letter_code, type_of_aid) %>%
na_to_pattern(0) %>%
mutate(name=type_of_aid) %>%
na_to_pattern("Total") %>%
arrange(member_state_2_letter_code,desc(budget))
budget_type_ms%>% glimpse
```
---
`r chunk_reveal(chunk_name = "budget3" ,
title = "### Budget by tipe of aid and type of aid and MS")`
---
Budget by type of measure and measure and MS
---
```{r budget4, include = FALSE}
budget_measure <- rep1_unique %>%
group_by(type_of_measure) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
ungroup %>%
filter(complete.cases(.)) %>%
mutate(share=budget/sum(budget))%>%
add_total(nrow(.)+1)
budget_measure%>% glimpse
budget_measure_ms <- rep1_unique %>%
group_by( member_state_2_letter_code,
type_of_measure) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
na.omit %>%
mutate(share=budget/sum(budget))%>%
group_modify(~ .x %>%
adorn_totals("row")) %>%
ungroup %>%
complete(member_state_2_letter_code,
type_of_measure) %>%
na_to_pattern(0)%>%
filter(type_of_measure !=" ", type_of_measure!=0)
budget_measure_ms%>% glimpse
```
---
`r chunk_reveal(chunk_name = "budget4" ,
title = "### Budget by tipe of measure and type of measure and MS")`
---
```{r budget5, include = FALSE}
budget_measure_case_ms <- rep1_unique %>%
group_by( member_state_2_letter_code,case_reference,
type_of_measure) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
na.omit %>%
mutate(share=budget/sum(budget))%>%
ungroup %>%
complete(member_state_2_letter_code,
type_of_measure) %>%
na_num_to_pattern(0) %>%
na_char_to_pattern("Missing")%>%
filter(type_of_measure!=" ", type_of_measure!=0)
budget_measure_case_ms%>% glimpse
save_excel(budget_measure_case_ms, "./intermediate_files/budget_measure_case.xlsx")
```
---
`r chunk_reveal(chunk_name = "budget5" ,
title = "### Detailed breakdown of the budget and save")`
---
```{r budget6, include = FALSE}
budget_by_beneficiary_sector <- rep1_unique %>%
group_by(beneficary_sector) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
ungroup %>%
filter(complete.cases(.)) %>%
mutate(share=budget/sum(budget)) %>%
arrange(desc(budget))%>%
add_total(nrow(.)+1) %>%
filter(beneficary_sector!=" ", beneficary_sector!=0)
budget_by_beneficiary_sector%>% glimpse
budget_by_beneficiary_sector_ms <- rep1_unique %>%
group_by(member_state_2_letter_code,beneficary_sector) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
na.omit %>%
mutate(share=budget/sum(budget)) %>%
group_modify(~ .x %>%adorn_totals("row")) %>%
ungroup%>%
complete(member_state_2_letter_code,beneficary_sector) %>%
na_to_pattern(0) %>%
filter(beneficary_sector!=" ", beneficary_sector!=0)
budget_by_beneficiary_sector_ms%>% glimpse
saveRDS(budget_by_beneficiary_sector_ms, "./intermediate_files/budget_sector_MS.RDS")
```
---
`r chunk_reveal(chunk_name = "budget6" ,
title = "### Budget by beneficiary sector and by beneficiary sector and MS")`
---
```{r budget7, include = FALSE}
budget_by_beneficiary_type <- rep1_unique %>%
group_by(beneficary_type) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
ungroup %>%
filter(complete.cases(.)) %>%
mutate(share=budget/sum(budget)) %>%
arrange(desc(budget))%>%
add_total(nrow(.)+1)%>%
filter(beneficary_type!=" ", beneficary_type!=0)
budget_by_beneficiary_type%>% glimpse
budget_by_beneficiary_type_ms <- rep1_unique %>%
group_by(member_state_2_letter_code,beneficary_type) %>%
summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
na.omit %>%
mutate(share=budget/sum(budget)) %>%
group_modify(~ .x %>%
adorn_totals("row")) %>%
ungroup%>%
complete(member_state_2_letter_code,beneficary_type) %>%
na_to_pattern(0)%>%
filter(beneficary_type!=" ", beneficary_type!=0)
budget_by_beneficiary_type_ms%>% glimpse
saveRDS(budget_by_beneficiary_type_ms, "./intermediate_files/budget_type_MS.RDS")
```
---
`r chunk_reveal(chunk_name = "budget7" ,
title = "### Budget by beneficiary type and by beneficiary type and MS")`
---
```{r budget8, include = FALSE}
decisions_test <- rep1_unique %>%
filter(decided=="YES")%>%
group_by(member_state_2_letter_code) %>%
summarise(n_decisions=n()) %>%
ungroup() %>%
arrange(member_state_2_letter_code)%>%
add_total(nrow(.)+1)
decisions_test%>% glimpse
undecided_count <- rep1_unique %>%
filter(!is.na(legal_basis),
decided=="NO")
undecided_count%>% glimpse
save_excel(undecided_count, "./intermediate_files/undecided_cases.xlsx")
```
---
`r chunk_reveal(chunk_name = "budget8" ,
title = "### Statistics on the decisions by MS and save")`
---
```{r decisions1, include = FALSE}
decisions_eu <- rep1_unique %>%
filter(decided=="YES")%>%
group_by(legal_basis) %>%
summarise(n_decisions=n()) %>%
ungroup() %>%
arrange(desc(n_decisions)) %>%
mutate(share=n_decisions/sum(n_decisions))%>%
add_total(nrow(.)+1)
decisions_eu %>% glimpse
decisions_eu_ms <- rep1_unique %>%
filter(decided=="YES")%>%
group_by( member_state_2_letter_code,legal_basis) %>%
summarise(n_decisions=n()) %>%
na.omit %>%
mutate(share=n_decisions/sum(n_decisions))%>%
group_modify(~ .x %>%
adorn_totals("row")) %>%
ungroup() %>%
complete(member_state_2_letter_code,legal_basis) %>%
na_to_pattern(0) %>%
filter(legal_basis!="0")
decisions_eu_ms %>% glimpse
saveRDS(decisions_eu_ms, "./intermediate_files/decisions_MS.RDS")
```
---
`r chunk_reveal(chunk_name = "decisions1" ,
title = "### Number decisions by MS and legal basis and save")`
---
```{r decisions2, include = FALSE}
decisions_eu_ms_time <- rep1_unique %>%
filter(decided=="YES") %>%
mutate(decision_date=excel_numeric_to_date(decision_date)) %>%
select(case_reference,member_state_2_letter_code,
decision_date,confirmed_budgets) %>%
mutate(year=year(decision_date),
month=month(decision_date)) %>%
group_by(member_state_2_letter_code, year, month) %>%
summarise(n_decisions=n(),
budget=sum(confirmed_budgets)) %>%
ungroup %>%
mutate(date=paste(year, "-", month, sep="")) %>%
mutate(date=ym(date)) %>%
arrange(member_state_2_letter_code,date) %>%
mutate(month_name=month.name[.$month]) %>%
group_by(member_state_2_letter_code) %>%
mutate(share_dec=n_decisions/sum(n_decisions)*100,
share_budget=budget/sum(budget)*100) %>%
mutate(share_dec=round_preserve_sum(share_dec,1),
share_budget=round_preserve_sum(share_budget,1)) %>%
ungroup %>%
mutate(date_easy=paste(month_name, year, sep=" "))
decisions_eu_ms_time %>% glimpse
saveRDS(decisions_eu_ms_time, "./intermediate_files/decisions_budget_time.RDS")
```
---
`r chunk_reveal(chunk_name = "decisions2" ,
title = "### Number decisions by MS along time and save")`
---
```{r amendments1, include = FALSE}
count_am <- df_am %>%
group_by(member_state_of_amendment) %>%
summarise(n_amendments=length(unique(amendment_case_number))) %>%
ungroup%>%
add_total(nrow(.)+1)
count_am %>% glimpse
saveRDS(count_am, "./intermediate_files/number_amendments.RDS")
```
---
`r chunk_reveal(chunk_name = "amendments1" ,
title = "### Number amendments by MS and save")`