# R/Pharma 2025 Workshop: datasetjson in R
# Exercises for hands-on practice with the datasetjson R package
#
# INSTRUCTIONS:
# 1. Work through each exercise section in order
# 2. Replace TODO comments with actual R code
# 3. Run your code to test it works
# 4. Check your answers against exercises/answers/01-r-answer.R
# 5. Ask for help if you get stuck!
# Load required libraries
library(datasetjson)
library(dplyr)
library(arrow) # For reading parquet files
# Set working directory to the workshop folder
# Adjust this path as needed for your setup
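# For example (this path is only an illustration - use wherever you placed the workshop):
# setwd("~/rpharma-2025-datasetjson-workshop")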
if (!file.exists("data")) {
warning("Data directory not found. Make sure you're in the workshop root directory.")
}
# =============================================================================
# THE WHOLE GAME: Complete Workflow with ADAE Data
# =============================================================================
# Now it's your turn! Repeat the same workflow we saw with ADSL, but using ADAE data.
# WG.1 Load the ADAE datasets (format doesn't matter - RDS or Parquet!)
# TODO: Load ADAE data using either read_parquet() or readRDS()
# Files: "data/adam/adae.parquet" or "data/adam/adae.rds"
# YOUR CODE HERE:
# WG.2 Load the metadata
# TODO: Load ADAE metadata from "data/adam/metadata/adae_meta.parquet" or .rds
# YOUR CODE HERE:
# WG.3 Examine what we have
# TODO: Look at the structure of adae and adae_meta
# Check some variable labels and data types
# Look for time variables - what class are they?
# YOUR CODE HERE:
# WG.4 Create Dataset-JSON object
# TODO: Use dataset_json() to combine the data and metadata
# HINT: Only data, name, dataset_label and columns are required.
# YOUR CODE HERE:
# WG.5 Write to standardized file
# TODO: Use write_dataset_json() to save as "ADAE.json"
# YOUR CODE HERE:
# WG.6 Read it back and verify
# TODO: Use read_dataset_json() to read the file back
# Check that labels are preserved and data is identical
# Pay attention to any time variables - do their classes change?
# Pay attention to any doubles - does their decimal precision change? If so, consider additional arguments to write_dataset_json() and read_dataset_json() to improve precision.
# YOUR CODE HERE:
# WG.7 Compare original and restored
# TODO: Use diffdf::diffdf() and/or waldo::compare() to check differences
# Hint: If variable types change, check your column/variable metadata
# YOUR CODE HERE:
print("🎉 Whole Game complete! You've mastered the Dataset-JSON workflow with ADAE.")Exercises + Solutions
Exercises
01-r.R
02-python.py
# R/Pharma 2025 Workshop: dsjconvert in Python
# Exercises for hands-on practice with the dsjconvert Python package
# EXERCISE INSTRUCTIONS:
# 1. Load the required packages needed to complete the exercises
# 2. Find the main() function and work through each exercise in order
# 3. Replace TODO comments with actual Python code
# 4. Use the examples in the slides to help you, or
# 5. Check the examples in the dsjconvert README.md (https://github.com/swhume/dataset-json)
# 6. Run your code to make sure it works
# 7. Check your answers against exercises/answers/02-python-answer.py
# 8. Ask for help if you get stuck!
from pathlib import Path
from dsjconvert import XPTConverter, MetadataExtractor
from dsjconvert import DatasetJSONToXPTConverter
def check_data_dir_exists() -> None:
directory_path = Path("data")
if not directory_path.is_dir():
raise FileNotFoundError(f"The directory '{directory_path}' does not exist or is not a directory.")
def main():
check_data_dir_exists()
# =============================================================================
# SOLUTION to Dataset-JSON conversion exercises using Python and dsjconvert
# =============================================================================
# Exercise 1: convert cm.xpt to CM.ndjson using the define.xml metadata
data_path = Path(__file__).parent.joinpath("data")
xpt_file = Path(data_path).joinpath("cm.xpt")
define_file = Path(data_path).joinpath("define.xml")
# TODO use a function to convert cm.xpt to CM.ndjson using the define.xml metadata
# Exercise 2: convert vs.xpt to VS.ndjson without using the define.xml metadata
xpt_file = Path(data_path).joinpath("vs.xpt")
# TODO use a function to convert vs.xpt to VS.ndjson without using the define.xml metadata
# Exercise 3: convert dm.xpt to dm.json using the define.xml metadata
xpt_file = Path(data_path).joinpath("dm.xpt")
# TODO use a function to convert dm.xpt to dm.json using the define.xml metadata
# Exercise 4: convert MH.ndjson to mh.xpt using the define.xml metadata
dsj_file = Path(data_path).joinpath("MH.ndjson")
# TODO use a function to convert MH.ndjson to mh.xpt
if __name__ == '__main__':
main()
# Placeholder for python

Solutions
01-r.R
# R/Pharma 2025 Workshop: datasetjson in R - ANSWER KEY
# Solutions for hands-on practice with the datasetjson R package
# Load required libraries
library(datasetjson)
library(dplyr)
library(arrow) # For reading parquet files
# Helper function for null coalescing
`%||%` <- function(x, y) if (is.null(x)) y else x
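# Usage sketch: x %||% "fallback" returns x unless x is NULL, in which case it returns "fallback"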
# Set working directory to the workshop folder
if (!file.exists("data")) {
warning("Data directory not found. Make sure you're in the workshop root directory.")
}
# =============================================================================
# THE WHOLE GAME: Complete Workflow with ADAE Data - SOLUTIONS
# =============================================================================
# WG.1 Load the ADAE datasets (format doesn't matter - RDS or Parquet!)
# Using parquet (could also use readRDS)
adae <- read_parquet("data/adam/adae.parquet")
# Alternative: adae <- readRDS("data/adam/adae.rds")
# WG.2 Load the metadata
adae_meta <- read_parquet("data/adam/metadata/adae_meta.parquet")
# Alternative: adae_meta <- readRDS("data/adam/metadata/adae_meta.rds")
# WG.3 Examine what we have
str(adae) # Adverse events data
head(adae_meta) # Metadata in Dataset-JSON format
str(attributes(adae))
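# Spot-check an individual variable label, e.g. (AETERM is a standard ADAE variable,
# assumed to exist in this dataset):
# attr(adae$AETERM, "label")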
# Look for time variables and their classes
time_vars <- adae_meta[adae_meta$dataType == "time", ]
if(nrow(time_vars) > 0) {
print("Time variables found:")
print(time_vars$name)
# Check original classes
for(var in time_vars$name) {
if(var %in% names(adae)) {
cat(var, ":", class(adae[[var]]), "\n")
}
}
}
# WG.4 Create Dataset-JSON object
adae_json <- dataset_json(
adae,
name = "ADAE",
dataset_label = "Adverse Events Analysis Dataset",
columns = adae_meta
)
# WG.5 Write to standardized file
write_dataset_json(adae_json, "ADAE_answer.json")
# WG.6 Read it back and verify
adae_restored <- read_dataset_json("ADAE_answer.json")
# Check attributes are preserved
str(attributes(adae_restored))
# Check time variable classes after round-trip
if(nrow(time_vars) > 0) {
print("Time variable classes after round-trip:")
for(var in time_vars$name) {
if(var %in% names(adae_restored)) {
cat(var, ":", class(adae_restored[[var]]), "\n")
}
}
}
# WG.7 Compare original and restored
diffdf::diffdf(adae, adae_restored)
waldo::compare(adae, adae_restored)
# Differences found:
# 1. The data frame class is now "datasetjson_v1_1_0", "datasetjson", "data.frame" - this is expected and OK.
# 2. ASEVN was numeric and is now character?! Check adae_meta: its dataType was set to "string" when it should be "integer".
adae_meta$dataType[adae_meta$name == "ASEVN"] <- "integer"
adae_json <- dataset_json(
adae,
name = "ADAE",
dataset_label = "Adverse Events Analysis Dataset",
columns = adae_meta
)
write_dataset_json(adae_json, "ADAE_answer.json")
adae_restored <- read_dataset_json("ADAE_answer.json")
diffdf::diffdf(adae, adae_restored)
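# If diffdf still reports tiny numeric differences in double columns (the WG.6 precision
# question), a tolerance-aware spot check can confirm they are only floating-point rounding,
# e.g. (the column name below is illustrative - substitute any double variable in ADAE):
# all.equal(adae$ADURN, adae_restored$ADURN)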
print("🎉 Whole Game complete! You've mastered the Dataset-JSON workflow with ADAE.")
print("Notice how time variables may change class during round-trip!")02-python.py
# R/Pharma 2025 Workshop: dsjconvert in Python
# Solution to the exercises for hands-on practice with the dsjconvert Python package
# =============================================================================
# This is the solution to the Python dsjconvert exercises.
# =============================================================================
# EXERCISE INSTRUCTIONS:
# 1. Load the required packages needed to complete the exercises
# 2. Find the main() function and work through each exercise in order
# 3. Replace TODO comments with actual Python code
# 4. Use the examples in the slides to help you, or
# 5. Check the examples in the dsjconvert README.md (https://github.com/swhume/dataset-json)
# 6. Run your code to make sure it works
# 7. Check your answers against exercises/answers/02-python-answer.py
# 8. Ask for help if you get stuck!
from pathlib import Path
from dsjconvert import XPTConverter, MetadataExtractor
from dsjconvert import DatasetJSONToXPTConverter
from os import getcwd
def convert_xpt_dataset(dsj_format: str, data_path: Path, xpt_file: Path, define_file: Path = None):
if define_file:
extractor = MetadataExtractor(define_file)
converter = XPTConverter(
metadata_extractor=extractor,
output_format=dsj_format,
skip_validation=True
)
else:
converter = XPTConverter(
output_format=dsj_format,
skip_validation=True
)
converter.convert_dataset(xpt_file, data_path)
def convert_dsj_dataset(dsj_format: str, data_path: Path, dsj_file: Path):
converter = DatasetJSONToXPTConverter(input_format=dsj_format, skip_validation=True)
converter.convert_dataset(dsj_file, data_path)
def check_data_dir_exists() -> None:
directory_path = Path("data")
if not directory_path.is_dir():
raise FileNotFoundError(f"The directory '{directory_path}' does not exist or is not a directory.")
def main():
check_data_dir_exists()
# =============================================================================
# SOLUTION to Dataset-JSON conversion exercises using Python and dsjconvert
# =============================================================================
# Exercise 1: convert cm.xpt to CM.ndjson using the define.xml metadata
data_path = Path(getcwd()).joinpath("data")
xpt_file = Path(data_path).joinpath("cm.xpt")
define_file = Path(data_path).joinpath("define.xml")
# TODO use a function to convert cm.xpt to CM.ndjson using the define.xml metadata
convert_xpt_dataset('ndjson', data_path, xpt_file, define_file)
# Exercise 2: convert vs.xpt to VS.ndjson without using the define.xml metadata
xpt_file = Path(data_path).joinpath("vs.xpt")
# TODO use a function to convert vs.xpt to VS.ndjson without using the define.xml metadata
convert_xpt_dataset('ndjson', data_path, xpt_file)
# Exercise 3: convert dm.xpt to dm.json using the define.xml metadata
xpt_file = Path(data_path).joinpath("dm.xpt")
# TODO use a function to convert dm.xpt to dm.json using the define.xml metadata
convert_xpt_dataset('json', data_path, xpt_file, define_file)
# Exercise 4: convert MH.ndjson to mh.xpt using the define.xml metadata
dsj_file = Path(data_path).joinpath("MH.ndjson")
# TODO use a function to convert MH.ndjson to mh.xpt
convert_dsj_dataset("ndjson", data_path, dsj_file)
if __name__ == '__main__':
main()
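# Optional sanity check (not part of the exercises): read a converted XPT back with pandas
# to confirm the round-trip, assuming pandas is installed and Exercise 4 wrote data/mh.xpt:
#   import pandas as pd
#   mh = pd.read_sas("data/mh.xpt", format="xport")
#   print(mh.head())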
# Placeholder for answers