This section converts the INDSTAT4 datasets into the ISIC 2 classification scheme at the 4-digit code level.
Rscript code/get_concordance_tables.R
:
.Rds
of original as-downloaded tables can be found in data/external/concordances
2020-05-15
tables are loaded from data/interim/
# import concordance tables
concord_tidy <- read_rds(here("data/interim/002-isicTables_2020-05-15.Rds"))
ISIC3_2_dupes <- show_duplicates(concord_tidy$ISIC3_2, ISIC3, ISIC2)
knitr::kable(ISIC3_2_dupes)
ISIC3 | needs_split_ISIC3 | ISIC2 | needs_split_ISIC2 | Detail | dupe |
---|---|---|---|---|---|
9249 | 1 | 9414 | 1 | Recording or taping of sound | TRUE |
9249 | 1 | 9414 | 1 | Casting or booking agencyactivities | TRUE |
partialISICx
columns indicates if the ISICx
code is split across multiple ISICy
codes
0
indicates that the ISICx
code has one-to-one mapping to ISICy
codes.ISICy
code might have a one-to-many relationship with ISICx
codespartialISICx
columns are renamed needs_split_ISICx
, where 0 indicates no split is needs, and 1 indicates a splitting of ISICx
values needs to be performedThe correspondence table shows links between the classes (4-digit level) in ISIC Rev 3.1 and ISIC Rev 3. The two fields
partial31
andpartial3
indicate whether the complete code (0) or only a part of it (1) is involved in a particular link. Descriptions of parts involved are provided.
ISIC3-ISIC2.txt
ISIC4_ISIC31.txt
ISIC_Rev_2-ISIC_Rev_3_1_correspondence.txt
ISIC4_ISIC31.txt
ISIC_Rev_31-ISIC_Rev_3_correspondence.txt
ISIC3-ISIC2.txt
# TODO: add plots of 3step & 4step data -- highlighting differences
## ---- define one transformation step ----
transform_later_into_earlier <- function(lookup_renamed, later_df, later_col, earlier_col, needs_split_col){
# quosures
later_col <- enquo(later_col)
earlier_col <- enquo(earlier_col)
needs_split_col <- enquo(needs_split_col)
# transformation steps
make_transformation_table(lookup_renamed, !!later_col, !!earlier_col, !!needs_split_col) %>%
use_transformation_table(., later_df, !!later_col, !!earlier_col)
}
## ---- apply transformation steps ----
# list for transformed data
inISIC2.4digit <- list()
# REV3
inISIC2.4digit$REV3 <- transform_later_into_earlier(concord_tidy$ISIC3_2, clean$REV3, ISIC3, ISIC2, needs_split_ISIC3)
## Test passed 🎊
# REV4
inISIC2.4digit$REV4_2step <-
transform_later_into_earlier(concord_tidy$ISIC4_31, clean$REV4, ISIC4, ISIC31, needs_split_ISIC4) %>%
transform_later_into_earlier(concord_tidy$ISIC31_2, ., ISIC31, ISIC2, needs_split_ISIC31)
## Test passed 🥳
## Test passed 🥳
inISIC2.4digit$REV4_3step <-
transform_later_into_earlier(concord_tidy$ISIC4_31, clean$REV4, ISIC4, ISIC31, needs_split_ISIC4) %>%
transform_later_into_earlier(concord_tidy$ISIC31_3, ., ISIC31, ISIC3, needs_split_ISIC31) %>%
transform_later_into_earlier(concord_tidy$ISIC3_2, ., ISIC3, ISIC2, needs_split_ISIC3)
## Test passed 🥇
## Test passed 🌈
## Test passed 🎊
## ---- summarise from 4digit to 3digit codes ----
inISIC2.3digit <- map(inISIC2.4digit, summarise_to_3digit)
cache_3digit <- here("data/interim/", paste0("002-INDSTAT_ISIC2_3digit_", lubridate::today(), ".Rds"))
write_rds(inISIC2.3digit, cache_3digit)
# packages needed to run these functions
require(testthat)
require(tidyverse)
# -- PROBING FUNCTIONS --
show_duplicates <- function(df, ...){
#' find duplicate rows based on ... cols
#' @param ... cols to identify duplication on
df %>%
group_by(...) %>%
mutate(dupe = n() > 1) %>%
filter(dupe == TRUE)
}
count_uniq_codes_in_REV <- function(lookup_renamed){
pivot_longer(lookup_renamed,
starts_with("ISIC"),
names_to = "REV",
values_to = "code") %>%
select(REV, code) %>%
distinct() %>%
count(REV, name = "n_uniq_codes")
}
# -- FUNCTIONS FOR CONCORDANCE --
remove_duplicates <- function(df, ..., keep_all_cols = FALSE){
#' remove duplicate rows from table based on ... cols
#' @param ... cols to identify duplication on
distinct(df, ..., .keep_all = keep_all_cols)
}
add_split_flags <- function(lookup_noDupes, later_col, earlier_col, needs_split_col){
#' create concordance table from later to earlier codes, with split denominator
#' @param lookup_noDupes concordance table back to earlier standard
#' @param later_col column name of later code -- e.g. ISIC3
#' @param earlier_col column name of later code -- e.g. ISIC2
# capture column name expressions
later_col <- enquo(later_col)
earlier_col <- enquo(earlier_col)
needs_split_col <- enquo(needs_split_col)
# see if group has any rows with non-zero needs_split/partial indicator
flags_by_later <-
lookup_noDupes %>%
group_by(!!later_col) %>%
summarise(earlier_matches = n_distinct(!!earlier_col),
group_rows = n(),
needs_split = ifelse(expr(sum(!!needs_split_col)) == 0, FALSE, TRUE),
.groups = "drop_last")
# verify grouping rows matches number of distinct codes
## wouldn't match if there was a duplicate later-earlier row
## i.e. -- distinct = 1, but group_rows ==2
test_that("rows in later group matches number of distinct earlier codes in group",
{
expect_equal(flags_by_later$group_rows, flags_by_later$earlier_matches)
})
#return(flags_by_later)
lookup_w_flags <-
left_join(x = (select(lookup_noDupes, !!later_col, !!earlier_col)),
y = (select(flags_by_later, -group_rows)),
by = quo_name(later_col))
return(lookup_w_flags)
}
make_transformation_table <- function(lookup_renamed, later_col, earlier_col, needs_split_col){
later_col <- enquo(later_col)
earlier_col <- enquo(earlier_col)
needs_split_col <- enquo(needs_split_col)
lookup_renamed %>%
remove_duplicates(., !!later_col, !!earlier_col, keep_all_cols = TRUE) %>%
add_split_flags(., !!later_col, !!earlier_col, !!needs_split_col)
}
use_transformation_table <- function(lookup_w_flags, later_df, later_col, earlier_col){
#' Use concordance table to adjust later data back to earlier code revision
#' @param lookup_w_flags concordance table from later to earlier
#' @param later_df data in later isic standard
#' @param later_col column name of later code -- e.g. ISIC3
#' @param earlier_col column name of later code -- e.g. ISIC2
later_col <- enquo(later_col)
earlier_col <- enquo(earlier_col)
# merge earlier codes into full dataset
earlier_adj <-
left_join(x = later_df,
y = lookup_w_flags,
by = quo_name(later_col)) %>%
## calculate later value contribution to earlier group
mutate(contribution.earlier = value.4 / earlier_matches) %>%
## add up values belonging to the same earlier code, from different later codes
group_by(country, year, !!earlier_col) %>%
summarise(value.4 = sum(contribution.earlier), .groups = "drop_last") %>%
ungroup()
return(earlier_adj)
# verify that concorded table has fewer rows -- since there are fewer codes
test_that("concorded 'earlier' table has expected bias compared to rows in later 'source' table",
{
### function for use in test
count_uniq_codes_used <- function(df_w_code_col, code_col){
code_col <- enquo(code_col)
df_w_code_col %>%
distinct(!!code_col) %>%
nrow()
}
## bias calc based on codes actually seen in dataset
n_later_codes_in_df <- later_df %>% distinct(!!later_col) %>% nrow()
n_earlier_codes_in_adj <- earlier_adj %>% distinct(!!earlier_col) %>% nrow()
bias_earlier_codes_used <- sign(n_earlier_codes_in_adj - n_later_codes_in_df)
## check table sizes match expectation
### i.e. fewer codes used --> fewer rows for same number of country, years
nrow_later_df <- nrow(later_df)
nrow_concorded <- nrow(earlier_adj)
print(nrow_later_df)
print(nrow_concorded)
bias_earlier_table <- sign(nrow_concorded - nrow_later_df)
expect_equal(bias_earlier_table, bias_earlier_codes_used)
})
return(earlier_adj)
}
# -- FUNCTIONS FOR SUMMARISING TO 3 DIGIT LEVEL --
summarise_to_3digit <- function(ISIC2_df){
ISIC2_df %>%
mutate(ISIC2.3digit = str_sub(ISIC2, start = 1, end = 3)) %>%
group_by(country, year, ISIC2.3digit) %>%
summarise(value.3 = sum(value.4), .groups = "drop_last") %>%
ungroup() %>%
arrange(country, year, ISIC2.3digit)
}
code/get_concordance_tables.R
:
#!/usr/bin/env Rscript
# ---- get concordance tables ----
require(tidyverse)
require(here)
require(testthat)
# get tables using read_csv
table_urls <- c("https://unstats.un.org/unsd/classifications/Econ/tables/ISIC/ISIC3_ISIC2/ISIC3-ISIC2.txt",
"https://unstats.un.org/unsd/classifications/Econ/tables/ISIC/ISIC4_ISIC31/ISIC4_ISIC31.txt",
"https://unstats.un.org/unsd/classifications/Econ/tables/ISIC/ISIC31_ISIC3/ISIC_Rev_31-ISIC_Rev_3_correspondence.txt",
"https://unstats.un.org/unsd/classifications/Econ/tables/ISIC/ISIC2_ISIC31/ISIC_Rev_2-ISIC_Rev_3_1_correspondence.txt")
tables <- map(table_urls, read_csv)
# name tables in list
table_names <- c("ISIC3_2", "ISIC4_31", "ISIC31_3", "ISIC31_2")
names(tables) <- table_names
# cache originals
cache_original <- here("data/external/concordances", paste0("isicTables_",lubridate::today(),".Rds"))
write_rds(tables, cache_original, compress = "gz")
# ---- tidy concordance tables ----
## Format column names consistently
change_colnames <- function(df){
extract_rev_num <- function(df){str_extract(names(df), "[0-9]+")}
first_4_cols <- paste0(c("ISIC", "needs_split_ISIC"), extract_rev_num(df)[1:4])
last_col_name <- "Detail"
all_cols <- c(first_4_cols, last_col_name, recursive = TRUE)
names(df) <- all_cols
return(df)
}
renamed <- map(tables, change_colnames)
## ISIC31_2 : Rearrange cols/rows by Rev31 rather than Rev2
renamed$ISIC31_2 %<>%
select(ends_with("31"), ends_with("2"), everything()) %>%
arrange(ISIC31)
# cache renamed tables
cache_rename <- here("data/interim", paste0("002-isicTables_",lubridate::today(),".Rds"))
write_rds(renamed, cache_rename)
# check data saved correctly
reload <- read_rds(cache_rename)
testthat::test_that("Saved data is the same as in-memory data", {
identical(reload, renamed)
})
concord_raw <- read_rds(here("data/external/concordances/isicTables_2020-05-15.Rds"))
map(concord_raw, head)
## $ISIC3_2
## # A tibble: 6 × 5
## ISIC3 partialISIC3 ISIC2 partialISIC2 Detail
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 0111 0 1110 1 Growing of cereals and other crops n.e.c
## 2 0112 1 1110 1 Growing of vegetables, horticultural specialities, nursery products
## 3 0112 1 1210 1 Gathering of mushrooms, truffles
## 4 0113 0 1110 1 Growing of fruit, nuts, beverage and spice crops
## 5 0121 0 1110 1 Farming of cattle, sheep, goats, horses, asses, mules and hinnies; dairy farming
## 6 0122 1 1110 1 Raising domesticated or wild animals n.e.c. (e.g. swine, poultry, rabbits)
##
## $ISIC4_31
## # A tibble: 6 × 5
## ISIC4code partialISIC4 ISIC31code partialISIC31 Detail
## <chr> <dbl> <chr> <dbl> <chr>
## 1 0111 0 0111 1 <NA>
## 2 0112 0 0111 1 <NA>
## 3 0113 1 0111 1 Growing of sugar beet, roots and tubers
## 4 0113 1 0112 1 Production of vegetable seeds, mushrooms and truffles; leafy or stem veg…
## 5 0114 0 0111 1 <NA>
## 6 0115 0 0111 1 <NA>
##
## $ISIC31_3
## # A tibble: 6 × 5
## Rev31 partial31 Rev3 partial3 Activity
## <chr> <dbl> <chr> <dbl> <chr>
## 1 0111 0 0111 0 <NA>
## 2 0112 0 0112 1 Growing of vegetables, horticultural specialties and nursery products, except olives
## 3 0113 1 0112 1 Growing of olives
## 4 0113 1 0113 0 Growing of fruit, nuts, beverage and spice crops
## 5 0113 1 0200 1 Gathering of berries or nuts
## 6 0121 0 0121 0 <NA>
##
## $ISIC31_2
## # A tibble: 6 × 5
## Rev2 partial2 Rev31 partial31 Activity
## <chr> <dbl> <chr> <dbl> <chr>
## 1 1110 1 0111 0 Growing of cereals and other crops n.e.c
## 2 1110 1 0112 1 Growing of vegetables, horticultural specialities, nursery products
## 3 1110 1 0113 0 Growing of fruit, nuts, beverage and spice crops
## 4 1110 1 0121 0 Farming of cattle, sheep, goats, horses, asses, mules and hinnies; dairy farming
## 5 1110 1 0122 1 Raising domesticated or wild animals n.e.c. (e.g. swine, poultry, rabbits)
## 6 1110 1 0130 0 Growing of crops combined with farming of animals (mixed farming)