|
| 1 | +""" |
| 2 | +Demo script for survival prediction preprocessing using SUPPORT2 dataset. |
| 3 | +
|
| 4 | +This example demonstrates how to: |
| 5 | +1. Load the SUPPORT2 dataset (using test data with 3 patients) |
| 6 | +2. Apply the preprocessing task to extract features and labels |
| 7 | +3. Examine preprocessed samples ready for model training |
| 8 | +
|
| 9 | +The preprocessing task extracts: |
| 10 | +- Features from raw patient data (demographics, vitals, labs, scores, etc.) |
| 11 | +- Ground truth survival probabilities from the dataset (surv2m/surv6m fields) |
| 12 | +- Structures data into samples ready for training a prediction model |
| 13 | +
|
| 14 | +Note: The survival probabilities shown are ground truth labels extracted from the |
| 15 | +dataset (surv2m/surv6m columns). These are the target variables that a model |
| 16 | +would learn to predict from the extracted features. |
| 17 | +
|
| 18 | +This example uses the synthetic test dataset from test-resources (3 patients). |
| 19 | +For real usage, replace the path with your actual SUPPORT2 dataset. |
| 20 | +""" |
| 21 | + |
| 22 | +import warnings |
| 23 | +import logging |
| 24 | +from pathlib import Path |
| 25 | + |
# Quiet noisy third-party output for a cleaner demo: hide all warnings and
# raise every relevant logger's threshold to WARNING.
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.WARNING)
for _logger_name in (
    "pyhealth",
    "pyhealth.datasets",
    "pyhealth.datasets.support2",
    "pyhealth.datasets.base_dataset",
):
    logging.getLogger(_logger_name).setLevel(logging.WARNING)
| 33 | + |
| 34 | +# Import pyhealth modules |
| 35 | +from pyhealth.datasets import Support2Dataset |
| 36 | +from pyhealth.tasks import SurvivalPreprocessSupport2 |
| 37 | + |
# Suppress tqdm progress bars for cleaner output.
def noop_tqdm(iterable, *args, **kwargs):
    """Stand-in for ``tqdm``: return the iterable untouched (no progress bar)."""
    return iterable

try:
    # Patch the pyhealth modules first, then the tqdm package itself,
    # mirroring the order the demo relies on.  Best effort only: if any
    # import/attribute is missing we simply keep the progress bars.
    from pyhealth.datasets import base_dataset, sample_dataset
    for _module in (base_dataset, sample_dataset):
        setattr(_module, "tqdm", noop_tqdm)
    import tqdm
    tqdm.tqdm = noop_tqdm
except (ImportError, AttributeError):
    pass
| 49 | + |
# Step 1: Load dataset using test data
print("=" * 70)
print("Step 1: Load SUPPORT2 Dataset")
print("=" * 70)

# Resolve the bundled synthetic fixture (3 patients) relative to this script.
_demo_dir = Path(__file__).parent
_support2_root = _demo_dir.parent / "test-resources" / "core" / "support2"

# `dataset` is consumed by the later steps; keep the name stable.
dataset = Support2Dataset(root=str(_support2_root), tables=["support2"])

print(f"Loaded dataset with {len(dataset.unique_patient_ids)} patients\n")
| 63 | + |
# Step 2: Apply preprocessing task to extract features and labels (2-month horizon)
for _banner_line in ("=" * 70, "Step 2: Apply Survival Preprocessing Task", "=" * 70):
    print(_banner_line)

task = SurvivalPreprocessSupport2(time_horizon="2m")
# NOTE(review): this rebinding shadows the `sample_dataset` module imported
# earlier for tqdm patching; harmless here, but worth renaming one of them.
sample_dataset = dataset.set_task(task=task)

print(
    f"Generated {len(sample_dataset)} samples",
    f"Input schema: {sample_dataset.input_schema}",
    f"Output schema: {sample_dataset.output_schema}\n",
    sep="\n",
)
| 74 | + |
# Helper function to decode tensor indices to feature strings
def decode_features(tensor, processor):
    """Map each index in *tensor* back to its original vocabulary token.

    Falls back to the raw index string when *processor* is ``None`` or has
    no ``code_vocab``; indices absent from the vocabulary render as
    ``<unk:IDX>``.
    """
    if processor is None or not hasattr(processor, 'code_vocab'):
        return [str(element.item()) for element in tensor]

    # Invert token -> index into index -> token for reverse lookup.
    index_to_token = {}
    for token, index in processor.code_vocab.items():
        index_to_token[index] = token

    decoded = []
    for element in tensor:
        position = element.item()
        decoded.append(index_to_token.get(position, f"<unk:{position}>"))
    return decoded
| 82 | + |
# Step 3: Display features for all samples
print("=" * 70)
print("Step 3: Examine Preprocessed Samples")
print("=" * 70)

# (sample key, label for the tensor-shape line, label for the decoded line).
# The two label columns differ on purpose ("Disease codes" vs "Disease Codes")
# to match the demo's expected output exactly.
_FEATURE_FIELDS = [
    ("demographics", "Demographics tensor shape", "Demographics"),
    ("disease_codes", "Disease codes tensor shape", "Disease Codes"),
    ("vitals", "Vitals tensor shape", "Vitals"),
    ("labs", "Labs tensor shape", "Labs"),
    ("scores", "Scores tensor shape", "Scores"),
    ("comorbidities", "Comorbidities tensor shape", "Comorbidities"),
]

# Sort samples numerically by patient_id so the output order is deterministic.
for sample in sorted(sample_dataset, key=lambda s: int(s['patient_id'])):
    print(f"\nPatient {sample['patient_id']}:")

    # Tensor shapes first, one line per feature group.
    for key, shape_label, _ in _FEATURE_FIELDS:
        print(f"  {shape_label}: {sample[key].shape}")

    # Then the human-readable decoded feature strings.
    for key, _, label in _FEATURE_FIELDS:
        tokens = decode_features(
            sample[key],
            sample_dataset.input_processors.get(key),
        )
        print(f"  {label}: {', '.join(tokens)}")

    # Ground-truth target extracted from the dataset (surv2m column).
    print(f"  Survival Probability (2m): {sample['survival_probability'].item():.4f}")
| 133 | + |
# Closing banner: the first entry prints a blank line plus print's own
# newline, matching the original double spacing.
for _line in (
    "\n",
    "=" * 70,
    "Preprocessing Complete!",
    "=" * 70,
    "The samples are ready for model training.",
):
    print(_line)