Skip to content

Commit 39ba033

Browse files
committed
feat(cli): provide human readable reporting in console output
Provide some information about identified chunks, extracted content, and
encountered errors. All the information is obtained from the ProcessResult
object. Example:

    poetry run unblob --report /tmp/report.json -f -e /tmp/out sample.img

    Extracted files: 3616
    Extracted directories: 609
    Extracted links: 782
    Extraction directory size: 298.19 MB.
    Chunks identification ratio: 90.74%
    Chunks distribution
    - EXTFS: 150.00 MB (54.17%)
    - ELF32: 73.98 MB (26.72%)
    - UNKNOWN: 25.64 MB (9.26%)
    - FAT: 16.00 MB (5.78%)
    - LZO: 9.28 MB (3.35%)
    - XZ: 1.03 MB (0.37%)
    - TAR: 860.00 KB (0.30%)
    - BZIP2: 93.56 KB (0.03%)
    - GZIP: 24.21 KB (0.01%)
    Encountered errors: 1
    - Severity.WARNING: MaliciousSymlinkRemoved
1 parent a67afe5 commit 39ba033

File tree

6 files changed

+212
-14
lines changed

6 files changed

+212
-14
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ build/
88
.idea
99
.coverage*
1010
/.venv/
11+
unblob.log

tests/test_cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,16 +216,18 @@ def test_archive_success(
216216
assert result.exit_code == 0
217217
assert "error" not in result.output
218218
assert "warning" not in result.output
219+
log_path = Path("unblob.log")
219220
config = ExtractionConfig(
220221
extract_root=tmp_path,
221222
max_depth=expected_depth,
222223
entropy_depth=expected_entropy_depth,
223224
entropy_plot=bool(expected_verbosity >= 3),
224225
process_num=expected_process_num,
225226
handlers=BUILTIN_HANDLERS,
227+
verbose=expected_verbosity,
226228
)
227229
process_file_mock.assert_called_once_with(config, in_path, None)
228-
logger_config_mock.assert_called_once_with(expected_verbosity, tmp_path)
230+
logger_config_mock.assert_called_once_with(expected_verbosity, tmp_path, log_path)
229231

230232

231233
@pytest.mark.parametrize(

unblob/cli.py

Lines changed: 131 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
11
#!/usr/bin/env python3
2+
import atexit
23
import sys
34
from pathlib import Path
4-
from typing import Iterable, Optional
5+
from typing import Dict, Iterable, List, Optional, Tuple
56

67
import click
8+
import pkg_resources
9+
from rich.console import Console
10+
from rich.panel import Panel
11+
from rich.table import Table
712
from structlog import get_logger
813

914
from unblob.models import DirectoryHandlers, Handlers, ProcessResult
1015
from unblob.plugins import UnblobPluginManager
11-
from unblob.report import Severity
16+
from unblob.report import ChunkReport, Severity, StatReport, UnknownChunkReport
1217

1318
from .cli_options import verbosity_option
1419
from .dependencies import get_dependencies, pretty_format_dependencies
@@ -25,6 +30,11 @@
2530
logger = get_logger()
2631

2732

33+
def restore_cursor():
    """Re-enable the terminal cursor hidden by rich's progress display.

    Registered via ``atexit`` so the cursor is restored even if unblob
    exits abnormally while a progress bar is active.
    """
    # ANSI escape code to show the cursor again.
    sys.stdout.write("\033[?25h")
    # Flush explicitly: this runs at interpreter shutdown, where a
    # block-buffered stdout could otherwise drop the escape sequence
    # and leave the user's terminal without a cursor.
    sys.stdout.flush()
36+
37+
2838
def show_external_dependencies(
2939
ctx: click.Context, _param: click.Option, value: bool # noqa: FBT001
3040
) -> None:
@@ -70,7 +80,7 @@ def __init__(
7080
handlers: Optional[Handlers] = None,
7181
dir_handlers: Optional[DirectoryHandlers] = None,
7282
plugin_manager: Optional[UnblobPluginManager] = None,
73-
**kwargs
83+
**kwargs,
7484
):
7585
super().__init__(*args, **kwargs)
7686
handlers = handlers or BUILTIN_HANDLERS
@@ -157,6 +167,13 @@ def __init__(
157167
type=click.Path(path_type=Path),
158168
help="File to store metadata generated during the extraction process (in JSON format).",
159169
)
170+
@click.option(
171+
"--log",
172+
"log_path",
173+
default=Path("unblob.log"),
174+
type=click.Path(path_type=Path),
175+
help="File to save logs (in text format). Defaults to unblob.log.",
176+
)
160177
@click.option(
161178
"-s",
162179
"--skip_extraction",
@@ -185,6 +202,7 @@ def cli(
185202
file: Path,
186203
extract_root: Path,
187204
report_file: Optional[Path],
205+
log_path: Path,
188206
force: bool, # noqa: FBT001
189207
process_num: int,
190208
depth: int,
@@ -198,7 +216,7 @@ def cli(
198216
plugin_manager: UnblobPluginManager,
199217
verbose: int,
200218
) -> ProcessResult:
201-
configure_logger(verbose, extract_root)
219+
configure_logger(verbose, extract_root, log_path)
202220

203221
plugin_manager.import_plugins(plugins_path)
204222
extra_handlers = plugin_manager.load_handlers_from_plugins()
@@ -219,10 +237,14 @@ def cli(
219237
handlers=handlers,
220238
dir_handlers=dir_handlers,
221239
keep_extracted_chunks=keep_extracted_chunks,
240+
verbose=verbose,
222241
)
223242

224243
logger.info("Start processing file", file=file)
225-
return process_file(config, file, report_file)
244+
process_results = process_file(config, file, report_file)
245+
if verbose == 0:
246+
print_report(process_results)
247+
return process_results
226248

227249

228250
cli.context_class = UnblobContext
@@ -242,6 +264,108 @@ def get_exit_code_from_reports(reports: ProcessResult) -> int:
242264
return 0
243265

244266

267+
def human_size(size: float) -> str:
    """Format *size* (a byte count) as a human readable string, e.g. ``1.50 KB``.

    Scales by powers of 1024 and caps at terabytes, so very large inputs
    come out as e.g. ``1024.00 TB`` rather than growing a new unit.
    """
    for unit in ("B", "KB", "MB", "GB"):
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    # Anything still >= 1024 GB is reported in terabytes, uncapped.
    return f"{size:.2f} TB"
274+
275+
276+
def get_chunks_distribution(task_results: List) -> Dict:
    """Aggregate chunk sizes per handler name across all task results.

    Sizes of :class:`UnknownChunkReport` entries are accumulated under the
    ``"unknown"`` key, which is always present (possibly 0). Every other
    :class:`ChunkReport` contributes its size under its handler name.
    """
    distribution: Dict = {"unknown": 0}

    for task_result in task_results:
        for report in task_result.reports:
            if isinstance(report, UnknownChunkReport):
                distribution["unknown"] += report.size
            elif isinstance(report, ChunkReport):
                name = report.handler_name
                distribution[name] = distribution.get(name, 0) + report.size

    return distribution
294+
295+
296+
def get_size_report(task_results: List) -> Tuple[int, int, int, int]:
    """Tally extraction statistics from the :class:`StatReport` entries.

    Returns a ``(files, directories, links, extracted_size)`` tuple, where
    ``extracted_size`` is the total size in bytes of regular files only.
    """
    total_files = 0
    total_dirs = 0
    total_links = 0
    extracted_size = 0

    for task_result in task_results:
        for report in task_result.reports:
            if not isinstance(report, StatReport):
                continue
            # is_file/is_dir/is_link are booleans; summing them counts entries.
            total_files += report.is_file
            total_dirs += report.is_dir
            total_links += report.is_link
            if report.is_file:
                extracted_size += report.size

    return total_files, total_dirs, total_links, extracted_size
314+
315+
316+
def print_report(reports: ProcessResult):
    """Print a human readable summary of an extraction run to the console.

    Shows counts of extracted files/directories/links and the extraction
    directory size, the ratio of identified vs. unknown chunks, a
    per-handler chunk size distribution table, and a table of errors
    encountered during processing. Prints nothing at all when no chunks
    were found (``total_size == 0``).
    """
    total_files, total_dirs, total_links, extracted_size = get_size_report(
        reports.results
    )
    chunks_distribution = get_chunks_distribution(reports.results)

    # "unknown" chunks count toward the total but not the identified size,
    # so valid/total is the identification ratio.
    valid_size = 0
    total_size = 0
    for handler, size in chunks_distribution.items():
        if handler != "unknown":
            valid_size += size
        total_size += size

    if total_size == 0:
        # Nothing was identified at all; stay silent and avoid a
        # division by zero in the ratio below.
        return

    summary = Panel(
        f"""Extracted files: [#00FFC8]{total_files}[/#00FFC8]
Extracted directories: [#00FFC8]{total_dirs}[/#00FFC8]
Extracted links: [#00FFC8]{total_links}[/#00FFC8]
Extraction directory size: [#00FFC8]{human_size(extracted_size)}[/#00FFC8]
Chunks identification ratio: [#00FFC8]{(valid_size/total_size) * 100:0.2f}%[/#00FFC8]""",
        subtitle="Summary",
        title=f"unblob ({get_version()})",
    )

    console = Console()
    console.print(summary)

    chunks_table = Table(title="Chunks distribution")
    chunks_table.add_column("Chunk type", justify="left", style="#00FFC8", no_wrap=True)
    chunks_table.add_column("Size", justify="center", style="#00FFC8", no_wrap=True)
    chunks_table.add_column("Ratio", justify="center", style="#00FFC8", no_wrap=True)

    # Largest chunk types first.
    for handler, size in sorted(
        chunks_distribution.items(), key=lambda item: item[1], reverse=True
    ):
        chunks_table.add_row(
            handler.upper(), human_size(size), f"{(size/total_size) * 100:0.2f}%"
        )

    console.print(chunks_table)

    # Idiomatic truthiness instead of len(...): empty error list prints nothing.
    if reports.errors:
        errors_table = Table(title="Encountered errors")
        errors_table.add_column("Severity", justify="left", style="cyan", no_wrap=True)
        errors_table.add_column("Name", justify="left", style="cyan", no_wrap=True)

        for error in reports.errors:
            errors_table.add_row(str(error.severity), error.__class__.__name__)
        console.print(errors_table)
367+
368+
245369
def main():
246370
try:
247371
# Click argument parsing
@@ -261,6 +385,8 @@ def main():
261385
except Exception:
262386
logger.exception("Unhandled exception during unblob")
263387
sys.exit(1)
388+
finally:
389+
atexit.register(restore_cursor)
264390

265391
sys.exit(get_exit_code_from_reports(reports))
266392

unblob/logging.py

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22
import pdb # noqa: T100
33
import sys
4+
from logging.handlers import WatchedFileHandler
45
from os import getpid
56
from pathlib import Path
67
from typing import Any
@@ -87,27 +88,67 @@ def filter_(_logger, _method_name: str, event_dict: structlog.types.EventDict):
8788
return filter_
8889

8990

90-
def configure_logger(verbosity_level: int, extract_root: Path):
91-
log_level = logging.DEBUG if verbosity_level > 0 else logging.INFO
91+
def configure_logger(verbosity_level: int, extract_root: Path, log_path: Path):
92+
if log_path.exists():
93+
log_path.unlink()
94+
95+
log_level = logging.DEBUG if verbosity_level > 0 else logging.CRITICAL
96+
9297
processors = [
98+
structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
99+
]
100+
101+
shared_processors = [
93102
structlog.stdlib.add_log_level,
94-
filter_debug_logs(verbosity_level),
103+
filter_debug_logs(verbosity_level or 2),
95104
structlog.processors.TimeStamper(
96105
key="timestamp", fmt="%Y-%m-%d %H:%M.%S", utc=True
97106
),
98107
pretty_print_types(extract_root),
99108
add_pid_to_log_message,
100109
structlog.processors.UnicodeDecoder(),
101110
structlog.processors.StackInfoRenderer(),
102-
structlog.processors.format_exc_info,
103-
structlog.dev.ConsoleRenderer(colors=sys.stdout.isatty()),
104111
]
105112

106113
structlog.configure(
107-
wrapper_class=structlog.make_filtering_bound_logger(log_level),
108-
processors=processors,
114+
wrapper_class=structlog.make_filtering_bound_logger(logging.DEBUG),
115+
processors=shared_processors + processors,
116+
logger_factory=structlog.stdlib.LoggerFactory(),
117+
)
118+
119+
formatter = structlog.stdlib.ProcessorFormatter(
120+
foreign_pre_chain=shared_processors,
121+
processors=[
122+
structlog.stdlib.ProcessorFormatter.remove_processors_meta,
123+
structlog.dev.ConsoleRenderer(
124+
colors=sys.stdout.isatty(),
125+
exception_formatter=structlog.dev.plain_traceback,
126+
),
127+
],
128+
)
129+
130+
file_formatter = structlog.stdlib.ProcessorFormatter(
131+
foreign_pre_chain=shared_processors,
132+
processors=[
133+
structlog.stdlib.ProcessorFormatter.remove_processors_meta,
134+
structlog.dev.ConsoleRenderer(
135+
colors=False, exception_formatter=structlog.dev.plain_traceback
136+
),
137+
],
109138
)
110139

140+
console_handler = logging.StreamHandler(sys.stdout)
141+
console_handler.setFormatter(formatter)
142+
console_handler.setLevel(log_level)
143+
144+
file_handler = WatchedFileHandler(log_path.as_posix())
145+
file_handler.setFormatter(file_formatter)
146+
file_handler.setLevel(logging.DEBUG)
147+
148+
root_logger = logging.getLogger()
149+
root_logger.addHandler(console_handler)
150+
root_logger.addHandler(file_handler)
151+
root_logger.setLevel(logging.DEBUG)
111152
structlog.get_logger().debug(
112153
"Logging configured",
113154
vebosity_level=noformat(verbosity_level),

unblob/processing.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import attr
88
import magic
99
import plotext as plt
10+
from rich import progress
11+
from rich.style import Style
1012
from structlog import get_logger
1113
from unblob_native import math_tools as mt
1214

@@ -91,6 +93,7 @@ class ExtractionConfig:
9193
extract_suffix: str = "_extract"
9294
handlers: Handlers = BUILTIN_HANDLERS
9395
dir_handlers: DirectoryHandlers = BUILTIN_DIR_HANDLERS
96+
verbose: int = 1
9497

9598
def get_extract_dir_for(self, path: Path) -> Path:
9699
"""Return extraction dir under root with the name of path."""
@@ -143,7 +146,26 @@ def _process_task(config: ExtractionConfig, task: Task) -> ProcessResult:
143146
processor = Processor(config)
144147
aggregated_result = ProcessResult()
145148

149+
if not config.verbose:
150+
progress_display = progress.Progress(
151+
progress.TextColumn(
152+
"Extraction progress: {task.percentage:>3.0f}%",
153+
style=Style(color="#00FFC8"),
154+
),
155+
progress.BarColumn(
156+
complete_style=Style(color="#00FFC8"), style=Style(color="#002060")
157+
),
158+
)
159+
progress_display.start()
160+
overall_progress_task = progress_display.add_task("Extraction progress:")
161+
146162
def process_result(pool, result):
163+
if config.verbose == 0 and progress_display.tasks[0].total is not None:
164+
progress_display.update(
165+
overall_progress_task,
166+
advance=1,
167+
total=progress_display.tasks[0].total + len(result.subtasks),
168+
)
147169
for new_task in result.subtasks:
148170
pool.submit(new_task)
149171
aggregated_result.register(result)
@@ -158,6 +180,10 @@ def process_result(pool, result):
158180
pool.submit(task)
159181
pool.process_until_done()
160182

183+
if not config.verbose:
184+
progress_display.remove_task(overall_progress_task)
185+
progress_display.stop()
186+
161187
return aggregated_result
162188

163189

unblob/testing.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515

1616
@pytest.fixture(scope="session", autouse=True)
1717
def configure_logging(): # noqa: PT004
18-
configure_logger(verbosity_level=3, extract_root=Path(""))
18+
configure_logger(
19+
verbosity_level=3, extract_root=Path(""), log_path=Path("unblob.log")
20+
)
1921

2022
# https://pytest-cov.readthedocs.io/en/latest/subprocess-support.html#if-you-use-multiprocessing-process
2123
cleanup_on_sigterm()

0 commit comments

Comments
 (0)