This section prepares UNIDO INDSTAT4 datasets for concordance at ISIC Revision 2 (Rev 2) level.
isiccomb
), which encompass multiple ISIC codes.HELP/metadata
– but the tables are not easily machine readableisic
) 3- and 4- digit codes.# key paths
path <- list()
path$rev3.output <- here('data/raw/datasets/REV_3', 'data/14-Output.csv')
path$rev4.output <- here('data/raw/datasets/REV_4', 'data/14-Output.csv')
# import raw data
raw <- list()
raw$REV3 <- read_csv(here('data/raw/datasets/REV_3', 'data/14-Output.csv'),
col_types = cols())
raw$REV4 <- read_csv(here('data/raw/datasets/REV_4','data/14-Output.csv'),
col_types = cols())
data/raw/datasets
ISIC_datasets/
source(here("code/fncs_probing.R"))
probe_raw <- function(raw_df){
print((substitute(raw_df)))
print(paste("col_names:", paste(names(raw_df), collapse = ", ")))
print(dim(raw_df) %>% paste(., c("rows", "cols"), collapse = " x "))
print(paste("No. distinct country year obs:", Q.how_many_country_year_rows(raw_df)))
}
raw$REV3 %>% filter(country == "004", year == 2002) %>% arrange(desc(isic))
## # A tibble: 152 × 9
## ctable country year isic isiccomb value utable source unit
## <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 14 004 2002 D D 11963511 14 1 $
## 2 14 004 2002 3720 3720 NA 14 0 $
## 3 14 004 2002 3710 3710 NA 14 0 $
## 4 14 004 2002 3699 3699 48240 14 1 $
## 5 14 004 2002 3694 3694 NA 14 0 $
## 6 14 004 2002 3693 3693 NA 14 0 $
## 7 14 004 2002 3692 3692 NA 14 0 $
## 8 14 004 2002 3691 3691 NA 14 0 $
## 9 14 004 2002 369 369 NA 14 0 $
## 10 14 004 2002 3610 3610 NA 14 0 $
## # … with 142 more rows
head(raw$REV4)
## # A tibble: 6 × 9
## ctable country year isic isiccomb value utable source unit
## <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 14 008 2010 1010 1010C 44363650 11 1 $
## 2 14 008 2010 1020 1010C NA 11 1 $
## 3 14 008 2010 1030 1030 5522606 11 1 $
## 4 14 008 2010 1040 1040 31307594 11 1 $
## 5 14 008 2010 1050 1050 16385013 11 1 $
## 6 14 008 2010 106 106 72390393 11 1 $
## test each country, year has only one unique isiccomb value:
probe_raw(raw$REV4)
## raw$REV4
## [1] "col_names: ctable, country, year, isic, isiccomb, value, utable, source, unit"
## [1] "127494 rows x 9 cols"
## Test passed 🎊
## [1] "No. distinct country year obs: 787"
ctable
: table code 14
corresponds to OUTPUT
dimension of INDSTAT4 datasetsource
: 4 possible values (0
, 1
, 2
, 3
). Code descriptions unknown.unit
: $
denotes value is in USD. All observations in this dataset at in USD (not National Currency)utable
: additional note on output valuation method. Consistent within country, year
but not country
unido-metadata/utable_descriptions.pdf
)11
: output at basic prices12
: output at factor prices13
: output at producers’ prices14
: output (valuation not defined)country
: three-digit country code. UNIDO metadata shows there should be 133 distinct countries in INDSTAT4 2019, Rev 3year
: 4-digit yearisic
: 4-digit, 3-digit or section code. 3-digit group and ‘Total Manufacturing’ section values are aggregations of 4-digit class valuesisiccomb
: same as isic
unless source data does not correspond with ISIC Rev 3 codes, in which case a custom code is created by appending a letter to the relevant 3 or 4 digit code.value
: output value in USDisiccomb
values across the isic
codes in that combination. Note that the effect of assuming equal split across child codes is mitigated by the re-aggregation to the 3-digit level once concordance to ISIC Rev 2 is performed.fncs_cleaning.R
– see Reference Code belowcountry-year
isic
code indicates whether the attached value
is at the section, 3-digit or 4- digit level.isic
indicate the group that the value
is recorded for.subset_34_from_total()
to subset initial table into a list with:
.$threefour
contains all 3- & 4- digit values.$totalrows
contains only the section totalsvalue
across contained isic codesisiccomb
group or classisiccomb
group has more than one value per year, country recordisiccomb
value into constituent isic
3- and 4- digit codes: i.e. average value
across isic
codes within isiccomb
split_isiccomb()
returns country, year, isic, isiccomb, value, value.nosplit, split.isiccomb
value
is the averaged value, while value.nosplit
is the raw value columnvalue
value3
, total4
or equal)create_34_comparison()
:
keep_larger_isic()
:
The following pipelines perform the steps described above.
# load cleaning functions
source(here('code/fncs_cleaning.R'))
## Wrapper functions
# STEP 1 & 2
raw_2_isiccomb <- function(raw_df, total_letter){
# select relevant columns from raw data
raw_df %>%
select(country, year, isic, isiccomb, value) %>%
# drop rows corresponding to section totals
subset_34_from_total(., total_letter) %>%
.$threefour %>%
# split isiccomb values over std isic codes
split_isiccomb(.)
}
# STEP 3
isiccomb_2_final <- function(isiccomb_df){
comparison_df <- create_34_comparison(isiccomb_df)
final <- keep_larger_isic(comparison_df, isiccomb_df)
return(final)
}
# Apply wrapped steps to raw data
final <- list()
final$REV3 <-
raw_2_isiccomb(raw$REV3, "D") %>%
isiccomb_2_final(.)
## Test passed 🎊
## Test passed 🥳
## Test passed 🥳
## Test passed 🥳
## Test passed 🎊
## Test passed 😀
## Test passed 😸
## Test passed 😀
## Test passed 🎊
final$REV4 <-
raw_2_isiccomb(raw$REV4, "C") %>%
isiccomb_2_final(.)
## Test passed 😸
## Test passed 🎊
## Test passed 🥳
## Test passed 😸
## Test passed 🌈
## Test passed 🥳
## Test passed 😀
## Test passed 🥳
## Test passed 😀
head(final$REV3, n=20)
## # A tibble: 20 × 5
## country year isic.3 isic.4 value.4
## <chr> <dbl> <chr> <chr> <dbl>
## 1 004 2002 154 1541 1760759.
## 2 004 2002 154 1542 1760759.
## 3 004 2002 154 1543 1760759.
## 4 004 2002 154 1544 1760759.
## 5 004 2002 154 1549 1760759.
## 6 004 2002 172 1722 1616039
## 7 004 2002 202 2029 217080
## 8 004 2002 221 2211 12060
## 9 004 2002 221 2212 12060
## 10 004 2002 221 2213 12060
## 11 004 2002 221 2219 12060
## 12 004 2002 252 2520 964799
## 13 004 2002 289 2891 66330
## 14 004 2002 289 2892 66330
## 15 004 2002 289 2893 66330
## 16 004 2002 289 2899 66330
## 17 004 2002 369 3699 48240
## 18 004 2003 154 1541 598818.
## 19 004 2003 154 1542 598818.
## 20 004 2003 154 1543 598818.
## cache processed data
write_rds(final, here("data/interim/001-clean_INDSTAT.Rds"))
The following code defines all the functions used in the data processing pipeline.
# -- Cleaning Functions --
# required packages
require(tidyverse)
require(testthat)
# function: split 3/4 digit & totals into subsets
subset_34_from_total <- function(combined_df, total_letter){
#' Helper function to split 3/4 digit values & totals into subset dfs
#' @param combined_df tibble for splitting
#' @param total_letter letter for Manufactoring section as string
## store subsetted dataset
subsets <- list()
## extract 3 & 4 digit group & class data
subsets$threefour <-
combined_df %>%
filter(isic != total_letter)
## extract rows containing section totals
subsets$totalrows <-
combined_df %>%
filter(isic == total_letter) %>%
pivot_wider(., c(country, year), names_from = isic, values_from = value) %>%
rename(., total.row = all_of(total_letter))
## verify that all rows are accounted for
test_that("All rows from combined_df are accounted for in subsets",
{
nrow.combined_df <- nrow(combined_df)
nrow.subsets <- sum(sapply(subsets, nrow))
expect_true(nrow.combined_df == nrow.subsets)
})
return(subsets)
}
# function: split values between isic code in isiccomb group
split_isiccomb <- function(threefour_df){
#' Helper function to split isiccomb values across isic codes
#' @param threefour_df df with 3/4 digit values across isic & isiccomb
# make list for interim tables
interim <- list()
# extract rows with isiccomb codes
interim$isiccomb.rows <-
threefour_df %>%
filter(., str_detect(isiccomb, '[:alpha:]'))
# test that we are not losing any data through spliting
test_that("No `country,year` has more than one recorded `value` per `isiccomb` group", {
rows_w_many_values_per_isiccomb <-
interim$isiccomb.rows %>%
group_by(country, year, isiccomb) %>%
## get no of recorded (not NA) values for given `country, year, isiccomb`
summarise(n_obs = sum(!is.na(value))) %>%
filter(n_obs != 1) %>%
nrow()
expect_true(rows_w_many_values_per_isiccomb == 0)
})
# calculate average value over isiccomb group for each country, year
interim$isiccomb.avg <-
interim$isiccomb.rows %>%
# group isiccomb rows, replace na with 0 for averaging
group_by(country, year, isiccomb) %>%
mutate(value = replace_na(value,0)) %>%
# split combination value over standard isic codes in isiccomb group
summarise(avg.value = mean(value),
## checking variables
n_isic = n_distinct(isic),
n_rows = n()) %>%
mutate(row_check = (n_isic == n_rows))
# return(interim$isiccomb.avg)
## check n_isic == n_rows
test_that("isiccomb split average is calculated with correct denominator", {
expect_true(all(interim$isiccomb.avg$row_check))
})
# output processed data
final <-
left_join(threefour_df, interim$isiccomb.avg, by = c('country', 'year', 'isiccomb')) %>%
rename(value.nosplit = value) %>%
mutate(value = coalesce(avg.value, value.nosplit),
split.isiccomb = !is.na(avg.value)) %>%
select(country, year, isic, isiccomb, value, value.nosplit, split.isiccomb) # not checking variables
return(final)
}
# function: split 3 & 4 digit into subsets
subset_3_from_4 <- function(threefour_df){
#' Helper function to split values with 3 digit codes from 4 digit codes
#' @param threefour_df processed three & four digit rows for splitting
# Split 3 & 4 digit rows into list of serparate tables
subsets <-
threefour_df %>%
## create flag for length by
## extract first three digits of isic code
mutate(isic.length = str_length(isic),
isic.3 = str_sub(isic, 1, 3)) %>%
## reorder columns & perform split
select(country, year, isic.3, isic.length, isic, value, everything()) %>%
split(., .$isic.length)
# Check all rows are accounted for
test_that("All rows are accounted for when splitting 3 & 4 digit rows",
{
nrow.input <- nrow(threefour_df)
sum_nrow.subsets <- sum(sapply(subsets, nrow))
expect_true(nrow.input == sum_nrow.subsets)
})
# List for function output
final <- list()
# tidy three-digit table
final$three_df <-
subsets$`3` %>%
select(country, year, isic.3, value)
# tidy four-digit table
final$four_df <-
subsets$`4` %>%
select(country, year, isic.3, isic.4 = isic, value)
# lookup table for isic.4 in each isic.3
final$lookup_isic.3 <-
final$four_df %>%
select(isic.3, isic.4) %>%
unique()
return(final)
}
# function: create comparison of 3-digit values & matching totals from 4-digit
create_34_comparison <- function(threefour_df){
# Split 3 & 4 digit rows into list of serparate tables
subsets <-
subset_3_from_4(threefour_df)
# Prepare 3 & 4 digit values for comparison
## extract 3 digit rows
extracted.3_digit_values <-
subsets$three %>%
select(country, year, isic.3, value) %>%
rename(value.3 = value) %>%
drop_na(value.3)
## summary of 4-digit values by 3-digit group
summary.4_digit_totals <-
subsets$four %>%
select(country, year, isic.3, isic.4, value.4 = value) %>%
drop_na(value.4) %>%
group_by(country, year, isic.3) %>%
summarise(total.4 = sum(value.4),
n_obs.4 = n())
# Comparison table of 4-digit totals with 3 digit value (if any)
comparison <-
full_join(x = extracted.3_digit_values,
y = summary.4_digit_totals,
by = c('country','year', 'isic.3')) %>%
# add difference calculations & flags
mutate_at(vars(value.3, total.4), ~replace(., is.na(.), 0)) %>%
mutate(diff = value.3 - total.4,
diff.size = abs(diff),
which.larger = factor(x = sign(diff),
levels = c(-1, 0, 1),
labels = c('total4', 'equal', 'value3'))
)
return(comparison)
}
# function: keep larger value between 3/4 values
keep_larger_isic <- function(comparison_df, threefour_df){
# function to split cases
split_cases_by_which.larger <- function(comparison_df){
# split cases
cases <-
comparison_df %>%
split(., .$which.larger) #value3 or total4
# verify split cases cover all rows
test_that("3vs4 comparison table split covers all rows",
{
nrow.3vs4 <- nrow(comparison_df)
nrow.sum_cases <- sum(sapply(cases, nrow))
expect_true(nrow.3vs4 == nrow.sum_cases)
})
return(cases)
}
# function adding averaged difference
add_avgDiff_from_value3 <- function(case_df, four_df){
#' @param case_df country, year, isic.3, value rows matching case
#' @param four_df all isic.4 rows
# calculate averaging denominator (no. isic.4 codes)
n_isic4_per_isic3 <-
four_df %>%
select(isic.3, isic.4) %>%
unique() %>%
count(isic.3, name="n_isic.4")
# Average difference between 3 & 4 total over isic.4 denominator
avgDiff_btwn_34 <-
left_join(x = case_df,
y = n_isic4_per_isic3,
by = "isic.3") %>%
mutate(avgDiff = diff.size / n_isic.4) %>%
select(country, year, isic.3, avgDiff)
# Add averaged difference to existing 4-digit values
sum_value4_and_diff3 <-
left_join(x = avgDiff_btwn_34,
y = four_df,
by = c("country", "year", "isic.3")) %>%
mutate(value.old = replace_na(value, 0),
value = avgDiff + value.old)
return(sum_value4_and_diff3)
}
# function for case where total4 is larger
keep_value4_only <- function(case_df, four_df){
#' @param case_df df with country, year, isic.3 rows matching case
#' @param four_df all isic.4 rows
case_match <-
case_df %>%
select(country, year, isic.3)
# select isic.4 values matching the case rows
inner_join(x = four_df,
y = case_match,
by = c('country', 'year', 'isic.3'))
}
# wrapper for processing & tidying cases
process_3_cases <- function(comparison_df, four_df){
# define cases
larger <- split_cases_by_which.larger(comparison_df)
# apply processing to each case
processed <- list()
processed$value3 <-
add_avgDiff_from_value3(larger$value3, four_df)
processed$equal <-
keep_value4_only(larger$equal, four_df) %>%
drop_na(value)
processed$total4 <-
keep_value4_only(larger$total4, four_df) %>%
drop_na(value)
# verify that all cases have been processed
test_that("All cases have been processed",
{
n.cases <- length(larger)
n.fixed <- length(processed)
expect_equal(n.cases, n.fixed)
})
test_that("Extracted expected number of value.4 based on n_obs.4 in comparison table", {
expected_n_obs <- comparison_df %>% filter(which.larger != "value3") %>% pull(n_obs.4) %>% sum(., na.rm = TRUE)
extracted_rows <- nrow(processed$equal) + nrow(processed$total4)
expect_equal(expected_n_obs, extracted_rows)
})
return(processed)
}
# create & test final data set
four_df <- subset_3_from_4(threefour_df)[['four_df']]
fixed_cases <- process_3_cases(comparison_df, four_df)
final <-
fixed_cases %>%
bind_rows(., .id = "which.larger") %>%
select(., country, year, isic.3, isic.4, value.4 = value) %>%
arrange(country, year, isic.3, isic.4)
test_that("nrow in combined table matches nrows in processed cases",
{
nrow.combined <- nrow(final)
nrow.cases <- sum(sapply(fixed_cases, nrow))
expect_equal(nrow.combined, nrow.cases)
})
return(final)
}