Exercises + Solutions

Exercises

01-r.R

# R/Pharma 2025 Workshop: datasetjson in R
# Exercises for hands-on practice with the datasetjson R package
#
# INSTRUCTIONS:
# 1. Work through each exercise section in order
# 2. Replace TODO comments with actual R code
# 3. Run your code to test it works
# 4. Check your answers against exercises/answers/01-r-answer.R
# 5. Ask for help if you get stuck!

# Load required libraries
library(datasetjson)
library(dplyr)
library(arrow)  # For reading parquet files

# Sanity check: this script expects to run from the workshop root folder,
# which contains the "data" directory. Adjust your working directory if needed.
# Use dir.exists() rather than file.exists(): file.exists("data") would also be
# TRUE for a plain file named "data", while we specifically need a directory
# (this mirrors Path("data").is_dir() in the Python exercises).
if (!dir.exists("data")) {
  warning("Data directory not found. Make sure you're in the workshop root directory.")
}

# =============================================================================
# THE WHOLE GAME: Complete Workflow with ADAE Data
# =============================================================================

# Now it's your turn! Repeat the same workflow we saw with ADSL, but using ADAE data.

# WG.1 Load the ADAE datasets (format doesn't matter - RDS or Parquet!)
# TODO: Load ADAE data using either read_parquet() or readRDS()
# Files: "data/adam/adae.parquet" or "data/adam/adae.rds"
# YOUR CODE HERE:


# WG.2 Load the metadata
# TODO: Load ADAE metadata from "data/adam/metadata/adae_meta.parquet" or .rds
# YOUR CODE HERE:


# WG.3 Examine what we have
# TODO: Look at the structure of adae and adae_meta
# Check some variable labels and data types
# Look for time variables - what class are they?
# YOUR CODE HERE:


# WG.4 Create Dataset-JSON object
# TODO: Use dataset_json() to combine the data and metadata
# HINT: Only data, name, dataset_label and columns are required.
# YOUR CODE HERE:


# WG.5 Write to standardized file
# TODO: Use write_dataset_json() to save as "ADAE.json"
# YOUR CODE HERE:


# WG.6 Read it back and verify
# TODO: Use read_dataset_json() to read the file back
# Check that labels are preserved and data is identical
# Pay attention to any time variables - do their classes change?
# Pay attention to any doubles - do their decimal precisions change?
# If so, consider additional arguments to the write/read functions to improve precision.
# YOUR CODE HERE:


# WG.7 Compare original and restored
# TODO: Use diffdf::diffdf() and/or waldo::compare() to check differences
# Hint: If variable types change, check your column/variable metadata
# YOUR CODE HERE:


print("🎉 Whole Game complete! You've mastered the Dataset-JSON workflow with ADAE.")

02-python.py

# R/Pharma 2025 Workshop: dsjconvert in Python
# Exercises for hands-on practice with the dsjconvert Python package

# EXERCISE INSTRUCTIONS:
# 1. Load the packages needed to complete the exercises
# 2. Find the main() function and work through each exercise in order
# 3. Replace TODO comments with actual Python code
# 4. Use the examples in the slides to help you, or
# 5. Check the examples in the dsjconvert README.md (https://github.com/swhume/dataset-json)
# 6. Run your code to make sure it works
# 7. Check your answers against exercises/answers/02-python-answer.py
# 8. Ask for help if you get stuck!

from pathlib import Path
from dsjconvert import XPTConverter, MetadataExtractor
from dsjconvert import DatasetJSONToXPTConverter


def check_data_dir_exists() -> None:
    """Fail fast unless a ``data`` directory exists in the current working directory.

    Raises:
        FileNotFoundError: if ``data`` is missing or is not a directory.
    """
    data_dir = Path("data")
    if data_dir.is_dir():
        return
    raise FileNotFoundError(f"The directory '{data_dir}' does not exist or is not a directory.")


def main():
    """Work through the four conversion exercises in order (fill in the TODOs)."""
    check_data_dir_exists()

    # =============================================================================
    # EXERCISES: Dataset-JSON conversion using Python and dsjconvert
    # =============================================================================

    # Exercise 1: convert cm.xpt to CM.ndjson using the define.xml metadata
    # Paths are resolved relative to this script's own directory.
    data_path = Path(__file__).parent.joinpath("data")
    xpt_file = Path(data_path).joinpath("cm.xpt")
    define_file = Path(data_path).joinpath("define.xml")
    # TODO use a function to convert cm.xpt to CM.ndjson using the define.xml metadata

    # Exercise 2: convert vs.xpt to VS.ndjson without using the define.xml metadata
    xpt_file = Path(data_path).joinpath("vs.xpt")
    # TODO use a function to convert vs.xpt to VS.ndjson without using the define.xml metadata

    # Exercise 3: convert dm.xpt to dm.json using the define.xml metadata
    xpt_file = Path(data_path).joinpath("dm.xpt")
    # TODO use a function to convert dm.xpt to dm.json using the define.xml metadata

    # Exercise 4: convert MH.ndjson to mh.xpt
    # NOTE(review): no define.xml is needed for this direction - the Dataset-JSON
    # file itself carries the metadata (see DatasetJSONToXPTConverter import above).
    dsj_file = Path(data_path).joinpath("MH.ndjson")
    # TODO use a function to convert MH.ndjson to mh.xpt


if __name__ == '__main__':
    main()
# Placeholder for python

Solutions

01-r.R

Warning in readLines("answers/01-r-answer.R"): incomplete final line found on
'answers/01-r-answer.R'
# R/Pharma 2025 Workshop: datasetjson in R - ANSWER KEY
# Solutions for hands-on practice with the datasetjson R package

# Load required libraries
library(datasetjson)
library(dplyr)
library(arrow)  # For reading parquet files

# Null-coalescing operator: return the left-hand side unless it is NULL,
# in which case fall back to the right-hand side.
# (Base R >= 4.4 ships `%||%`; defining it here keeps the script portable.)
`%||%` <- function(lhs, rhs) {
  if (is.null(lhs)) {
    rhs
  } else {
    lhs
  }
}

# Sanity check: the script expects to run from the workshop root folder,
# which contains the "data" directory.
# Use dir.exists() rather than file.exists(): file.exists("data") would also be
# TRUE for a plain file named "data", while we specifically need a directory
# (this mirrors Path("data").is_dir() in the Python exercises).
if (!dir.exists("data")) {
  warning("Data directory not found. Make sure you're in the workshop root directory.")
}

# =============================================================================
# THE WHOLE GAME: Complete Workflow with ADAE Data - SOLUTIONS
# =============================================================================

# WG.1 Load the ADAE dataset (storage format doesn't matter - RDS or Parquet!)
# Using parquet via {arrow}; the .rds alternative below yields the same data.
adae <- read_parquet("data/adam/adae.parquet")
# Alternative: adae <- readRDS("data/adam/adae.rds")

# WG.2 Load the variable-level metadata (columns include at least name and dataType)
adae_meta <- read_parquet("data/adam/metadata/adae_meta.parquet")
# Alternative: adae_meta <- readRDS("data/adam/metadata/adae_meta.rds")

# WG.3 Examine what we have
str(adae)  # Adverse events data
head(adae_meta)  # Metadata in Dataset-JSON format
str(attributes(adae))  # Attributes (e.g. column labels) we want preserved on round-trip

# Look for time variables - what class do they have before the round-trip?
# Use `%in%` rather than `==`: `==` returns NA for missing dataType values, and
# an NA in a logical row index pulls all-NA garbage rows into time_vars.
time_vars <- adae_meta[adae_meta$dataType %in% "time", ]

if (nrow(time_vars) > 0) {
  print("Time variables found:")
  print(time_vars$name)
  # Report the original class of each time variable present in the data
  for (var in time_vars$name) {
    if (var %in% names(adae)) {
      cat(var, ":", class(adae[[var]]), "\n")
    }
  }
}


# WG.4 Create Dataset-JSON object
# Only data, name, dataset_label and columns are required; `columns` supplies
# the variable-level metadata (labels, dataType, ...).
adae_json <- dataset_json(
  adae,
  name = "ADAE",
  dataset_label = "Adverse Events Analysis Dataset",
  columns = adae_meta
)

# WG.5 Write to standardized file
write_dataset_json(adae_json, "ADAE_answer.json")

# WG.6 Read it back and verify
adae_restored <- read_dataset_json("ADAE_answer.json")

# Check attributes (labels etc.) are preserved after the round-trip
str(attributes(adae_restored))

# Check time variable classes after the round-trip
# (time_vars was built in WG.3; their class may differ after read_dataset_json)
if(nrow(time_vars) > 0) {
  print("Time variable classes after round-trip:")
  for(var in time_vars$name) {
    if(var %in% names(adae_restored)) {
      cat(var, ":", class(adae_restored[[var]]), "\n")
    }
  }
}

# WG.7 Compare original and restored
# diffdf and waldo both report structural and value differences; either works.
diffdf::diffdf(adae, adae_restored)
waldo::compare(adae, adae_restored)

# Observed differences:
# 1. The restored object's class is now
#    datasetjson_v1_1_0 / datasetjson / data.frame - this is ok and expected!
# 2. ASEVN was numeric but came back as character. The supplied ADAE metadata
#    contains an error: its dataType was "string" when it should be "integer".
#    Fix the metadata and round-trip again.
# `%in%` (rather than `==`) keeps this subassignment safe: an NA in `name`
# would make `==` yield NA, and NA indices are an error in `[<-`.
adae_meta$dataType[adae_meta$name %in% "ASEVN"] <- "integer"

# Rebuild the Dataset-JSON object with the corrected column metadata
adae_json <- dataset_json(
  adae,
  name = "ADAE",
  dataset_label = "Adverse Events Analysis Dataset",
  columns = adae_meta
)

write_dataset_json(adae_json, "ADAE_answer.json")

adae_restored <- read_dataset_json("ADAE_answer.json")

# With corrected metadata this should no longer flag a type change for ASEVN
diffdf::diffdf(adae, adae_restored)

print("🎉 Whole Game complete! You've mastered the Dataset-JSON workflow with ADAE.")
print("Notice how time variables may change class during round-trip!")

02-python.py

# R/Pharma 2025 Workshop: dsjconvert in Python
# Solution to the exercises for hands-on practice with the dsjconvert Python package
# =============================================================================
# This is the solution to the Python dsjconvert exercises.
# =============================================================================
# EXERCISE INSTRUCTIONS:
# 1. Load the packages needed to complete the exercises
# 2. Find the main() function and work through each exercise in order
# 3. Replace TODO comments with actual Python code
# 4. Use the examples in the slides to help you, or
# 5. Check the examples in the dsjconvert README.md (https://github.com/swhume/dataset-json)
# 6. Run your code to make sure it works
# 7. Check your answers against exercises/answers/02-python-answer.py
# 8. Ask for help if you get stuck!

from pathlib import Path
from dsjconvert import XPTConverter, MetadataExtractor
from dsjconvert import DatasetJSONToXPTConverter
from os import getcwd


def convert_xpt_dataset(dsj_format: str, data_path: Path, xpt_file: Path, define_file: Path = None):
    """Convert one XPT file to Dataset-JSON in the requested flavor.

    Args:
        dsj_format: Output format passed to XPTConverter (e.g. "json" or "ndjson").
        data_path: Directory the converted output is written into.
        xpt_file: The SAS transport (.xpt) file to convert.
        define_file: Optional define.xml; when supplied, its metadata is
            extracted and used to enrich the converted dataset.
    """
    converter_kwargs = {
        "output_format": dsj_format,
        "skip_validation": True,
    }
    if define_file:
        # Pull variable-level metadata from define.xml into the conversion.
        converter_kwargs["metadata_extractor"] = MetadataExtractor(define_file)
    XPTConverter(**converter_kwargs).convert_dataset(xpt_file, data_path)


def convert_dsj_dataset(dsj_format: str, data_path: Path, dsj_file: Path):
    """Convert a Dataset-JSON file (``json`` or ``ndjson``) back to XPT.

    Args:
        dsj_format: Input format of ``dsj_file``, passed to the converter.
        data_path: Directory the resulting .xpt file is written into.
        dsj_file: The Dataset-JSON file to convert.
    """
    dsj_to_xpt = DatasetJSONToXPTConverter(
        input_format=dsj_format,
        skip_validation=True,
    )
    dsj_to_xpt.convert_dataset(dsj_file, data_path)


def check_data_dir_exists() -> None:
    """Fail fast unless a ``data`` directory exists in the current working directory.

    Raises:
        FileNotFoundError: if ``data`` is missing or is not a directory.
    """
    data_dir = Path("data")
    if not data_dir.is_dir():
        raise FileNotFoundError(f"The directory '{data_dir}' does not exist or is not a directory.")


def main():
    """Run the four workshop conversions in order, writing outputs into ./data."""
    check_data_dir_exists()

    # =============================================================================
    # SOLUTION to Dataset-JSON conversion exercises using Python and dsjconvert
    # =============================================================================

    # Exercise 1: convert cm.xpt to CM.ndjson using the define.xml metadata
    # Paths are resolved relative to the current working directory.
    data_path = Path(getcwd()).joinpath("data")
    xpt_file = Path(data_path).joinpath("cm.xpt")
    define_file = Path(data_path).joinpath("define.xml")
    # Solution: pass define_file so its metadata enriches the NDJSON output
    convert_xpt_dataset('ndjson', data_path, xpt_file, define_file)

    # Exercise 2: convert vs.xpt to VS.ndjson without using the define.xml metadata
    # Solution: omit define_file - metadata comes from the XPT file alone
    xpt_file = Path(data_path).joinpath("vs.xpt")
    convert_xpt_dataset('ndjson', data_path, xpt_file)

    # Exercise 3: convert dm.xpt to dm.json using the define.xml metadata
    # Solution: same as Exercise 1 but requesting the single-object 'json' flavor
    xpt_file = Path(data_path).joinpath("dm.xpt")
    convert_xpt_dataset('json', data_path, xpt_file, define_file)

    # Exercise 4: convert MH.ndjson to mh.xpt
    # NOTE(review): the original comment said "using the define.xml metadata",
    # but convert_dsj_dataset() takes no define file - the Dataset-JSON input
    # itself carries the metadata for this direction.
    dsj_file = Path(data_path).joinpath("MH.ndjson")
    convert_dsj_dataset("ndjson", data_path, dsj_file)


if __name__ == '__main__':
    main()
# Placeholder for answers