3 changes: 3 additions & 0 deletions docker/lr-mosdepth/Dockerfile
@@ -18,3 +18,6 @@ RUN curl https://sdk.cloud.google.com | bash

# activate conda environment
RUN echo "source activate lr-mosdepth" > ~/.bashrc

# copy the script
COPY coverage_stats.py /
166 changes: 166 additions & 0 deletions docker/lr-mosdepth/coverage_stats.py
@@ -0,0 +1,166 @@
import argparse
import gzip
import json
import logging

import numpy as np
import pandas as pd


def parse_arguments():
    parser = argparse.ArgumentParser(description="Process coverage values.")
    parser.add_argument("mosdepth_regions", type=str,
                        help="Mosdepth output file containing coverage values.")
    parser.add_argument("--cov_col", type=int, default=4,
                        help="Column holding the coverage values.")
    parser.add_argument("--output_prefix", type=str,
                        help="Prefix to use for output files.")
    parser.add_argument("--round", type=int, default=2, help="Rounding precision.")
    parser.add_argument("--debug", action="store_true", help="Debug mode.")
    return parser.parse_args()


def calculate_summary_statistics(
    df: pd.DataFrame, cov_col: int, round_precision: int
) -> dict:
    """
    Calculate summary statistics for the coverage values.
    @param df: Dataframe containing coverage values
    @param cov_col: Column containing coverage values
    @param round_precision: Rounding precision
    @return:
    """

    cov_data: pd.Series = df.iloc[:, cov_col - 1]
    mean_val: float = cov_data.mean()
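    # mean absolute deviation around the mean (not the median-based MAD)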
    mad_val: float = np.mean(np.abs(cov_data - mean_val))

    statistics = {
        "cov_mean": round(mean_val, round_precision),
        "cov_q1": round(cov_data.quantile(0.25), round_precision),
        "cov_median": round(cov_data.median(), round_precision),
        "cov_q3": round(cov_data.quantile(0.75), round_precision),
        "cov_iqr": round(cov_data.quantile(0.75) - cov_data.quantile(0.25), round_precision),
        "cov_stdev": round(cov_data.std(), round_precision),
        "cov_mad": round(mad_val, round_precision),
        "cov_percent_above_4x": percentage_greater_than_4x(df, cov_col, round_precision),
        "cov_evenness_score": calculate_evenness_score(df, cov_col, round_precision)
Comment on lines +39 to +47
Copilot AI Jun 6, 2025

JSON key names (e.g. cov_mean, cov_q1, etc.) don’t match the example output (mean_cov, q1_cov, etc.). Update the keys to align with the documented format.

Suggested change
"cov_mean": round(mean_val, round_precision),
"cov_q1": round(cov_data.quantile(0.25), round_precision),
"cov_median": round(cov_data.median(), round_precision),
"cov_q3": round(cov_data.quantile(0.75), round_precision),
"cov_iqr": round(cov_data.quantile(0.75) - cov_data.quantile(0.25), round_precision),
"cov_stdev": round(cov_data.std(), round_precision),
"cov_mad": round(mad_val, round_precision),
"cov_percent_above_4x": percentage_greater_than_4x(df, cov_col, round_precision),
"cov_evenness_score": calculate_evenness_score(df, cov_col, round_precision)
"mean_cov": round(mean_val, round_precision),
"q1_cov": round(cov_data.quantile(0.25), round_precision),
"median_cov": round(cov_data.median(), round_precision),
"q3_cov": round(cov_data.quantile(0.75), round_precision),
"iqr_cov": round(cov_data.quantile(0.75) - cov_data.quantile(0.25), round_precision),
"stdev_cov": round(cov_data.std(), round_precision),
"mad_cov": round(mad_val, round_precision),
"percent_above_4x_cov": percentage_greater_than_4x(df, cov_col, round_precision),
"evenness_score_cov": calculate_evenness_score(df, cov_col, round_precision)

    }

    # Replace Nan values with null
    for key, value in statistics.items():
        if pd.isna(value):
            statistics[key] = "null"
Comment on lines +50 to +53
Copilot AI Jun 6, 2025

Assigning the string "null" will produce a JSON string instead of a true null. Use None so that json.dump serializes it as a JSON null.

Suggested change
-    # Replace Nan values with null
-    for key, value in statistics.items():
-        if pd.isna(value):
-            statistics[key] = "null"
+    # Replace NaN values with None
+    for key, value in statistics.items():
+        if pd.isna(value):
+            statistics[key] = None

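
A quick illustration of the difference (not part of the PR, just for reference; the key name is arbitrary):

import json
print(json.dumps({"cov_mean": None}))    # -> {"cov_mean": null}
print(json.dumps({"cov_mean": "null"}))  # -> {"cov_mean": "null"}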

    return statistics


def percentage_greater_than_4x(
    df: pd.DataFrame, cov_col: int, round_precision: int
) -> float:
    """
    Calculate the percentage of coverage values greater than 4x.
    @param df: Dataframe containing coverage values
    @param cov_col: Column containing coverage values
    @param round_precision: Rounding precision
    @return:
    """
    logging.debug("Calculating percentage of coverage values greater than 4x")
    total_bases = len(df)
    bases_above_4x = df[df.iloc[:, cov_col - 1] > 4].shape[0]
    percent_above_4x = round(bases_above_4x / total_bases, round_precision)

    logging.info("Percentage of coverage values greater than 4x: %s", percent_above_4x)

    return percent_above_4x


def calculate_evenness_score(
        df: pd.DataFrame, cov_col: int, round_precision: int) -> float:
    """
    Calculate the evenness score.
    Konrad Oexle, Journal of Human Genetics 2016, Evaluation of the evenness score in NGS.
    https://www.nature.com/articles/jhg201621

    @param df: Dataframe containing coverage values
    @param cov_col: Column containing coverage values
    @param round_precision: Rounding precision
    @return:
    """
    logging.debug("Calculating evenness score")
    mean_coverage = df.iloc[:, cov_col - 1].mean()
    # Get the coverage values that are less than or equal to the mean coverage
    d2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist()
    # count of coverage values that are less than or equal to the mean coverage
    d2_count = len(d2)
    # sum of coverage values that are less than or equal to the mean coverage
    d2_sum = sum(d2)
Comment on lines +93 to +97
Copilot AI Jun 6, 2025

Converting the series to a list and then summing in Python is less efficient. Consider using vectorized operations: d2 = cov_data[cov_data <= mean_coverage], then d2_count = d2.size and d2_sum = d2.sum().

Suggested change
-    d2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist()
-    # count of coverage values that are less than or equal to the mean coverage
-    d2_count = len(d2)
-    # sum of coverage values that are less than or equal to the mean coverage
-    d2_sum = sum(d2)
+    d2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1]
+    # count of coverage values that are less than or equal to the mean coverage
+    d2_count = d2.size
+    # sum of coverage values that are less than or equal to the mean coverage
+    d2_sum = d2.sum()

    # total number of coverages
    coverage_count = len(df)

    logging.debug("Mean coverage: %s", mean_coverage)
    logging.debug("D2 count: %s", d2_count)
    logging.debug("D2 sum: %s", d2_sum)
    logging.debug("Coverage count: %s", coverage_count)

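    # Oexle (2016) evenness score: E = 1 - (|D2| - sum(D2) / mean_cov) / N,
    # where D2 holds the coverage values <= the mean coverage and N is the
    # total number of regions; E equals 1 for perfectly uniform coverage.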
    if mean_coverage != 0:
        evenness_score = round(1 - (d2_count - (d2_sum / mean_coverage)) / coverage_count, round_precision)
    else:
        logging.warning("Mean coverage is zero. Evenness score will be set to null.")
        evenness_score = "null"

    logging.info("Evenness score: %s", evenness_score)

    return evenness_score


def open_file(file_path: str) -> pd.DataFrame:
    """
    Open a file and return a dataframe
    @param file_path: Path to the file
    @return:
    """

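    # Both gzip-compressed and plain-text inputs are read as headerless,
    # tab-separated tables (the mosdepth regions output format).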
    if file_path.endswith(".gz"):
        with gzip.open(file_path, "rt") as f:
            df = pd.read_csv(f, sep="\t", header=None)
    else:
        with open(file_path, "rt") as f:
            df = pd.read_csv(f, sep="\t", header=None)

    return df


def main():
    args = parse_arguments()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    logging.info("Arguments: %s", args)
    logging.info("Calculating coverage statistics")

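    # Default the prefix to the input file name up to its first "." when no
    # --output_prefix is given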
    prefix = args.output_prefix if args.output_prefix else args.mosdepth_regions.split(".")[0]

    df: pd.DataFrame = open_file(args.mosdepth_regions)
    logging.info("Opened file: %s", args.mosdepth_regions)

    logging.debug("Dataframe shape: %s", df.shape)
    logging.debug("Dataframe columns: %s", df.columns)
    logging.debug(f"Dataframe Head\n{df.head()}")

    statistics = calculate_summary_statistics(df, args.cov_col, args.round)

    logging.info("Summary statistics: %s", statistics)

    summary_file = f"{prefix}.cov_stat_summary.json"

    logging.info("Writing summary statistics to file: %s", summary_file)
    with open(summary_file, "w") as f:
        json.dump(statistics, f)
        f.write("\n")


if __name__ == "__main__":
    main()
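
For reference, a typical invocation of the new script inside the container might look like the following (file names here are illustrative, not taken from the PR):

python /coverage_stats.py sample.regions.bed.gz --cov_col 4 --output_prefix sample

This would write the summary statistics to sample.cov_stat_summary.json.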
2 changes: 2 additions & 0 deletions docker/lr-mosdepth/environment.yml
@@ -44,6 +44,8 @@ dependencies:
- pyvcf=0.6.8=py36_0
- readline=8.1=h27cfd23_0
- samtools=1.11=h6270b1f_0
- numpy=1.19.5
- pandas=1.1.5
- setuptools=52.0.0=py36h06a4308_0
- six=1.15.0=py36h06a4308_0
- sqlite=3.35.4=hdfb4753_0