#!/usr/bin/env perl
#
#   A software toolkit for the interconversion of standard data models for phenotypic data
#
#   This file is part of Convert::Pheno
#
#   Last Modified: Apr/09/2024
#
#   $VERSION taken from Convert::Pheno
#
#   Copyright (C) 2022-2026 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
#
#   License: Artistic License 2.0

package main;

use strict;
use warnings;
use autodie;
use feature      qw(say);
use Data::Dumper;
use Sys::Hostname;
use POSIX                          qw(strftime);
use File::Spec::Functions          qw(catdir catfile);
use File::ShareDir::ProjectDistDir qw(dist_dir);
use FindBin                        qw($Bin);
use lib "$Bin/../lib";
use Term::ANSIColor qw(:constants);
use Convert::Pheno;
use Convert::Pheno::CLI::Args qw(build_cli_request);
use Convert::Pheno::IO::CSVHandler;
use Convert::Pheno::OMOP::Definitions;

# Have Data::Dumper emit hash keys in sorted order (used for the debug dump
# of the request further down in this script)
$Data::Dumper::Sortkeys = 1;

# Script-level defaults:
#   $out_dir     - default output directory (current working directory)
#   $share_dir   - directory returned by dist_dir() for 'Convert-Pheno'
#   $schema_file - <schema/mapping.json> located under $share_dir
# $out_dir and $schema_file are passed to build_cli_request() below.
my $out_dir     = '.';
my $share_dir   = dist_dir('Convert-Pheno');
my $schema_file = catfile( $share_dir, 'schema', 'mapping.json' );

# Print the full command-line help text.
#
# Arguments:
#   $out - optional filehandle/glob to print to; STDOUT is used when the
#          argument is missing or false.
sub print_help {
    my ($out) = @_;
    $out ||= *STDOUT;

    # Non-interpolating heredoc: the text is emitted verbatim
    my $help_text = <<'EOF';
Usage:
  convert-pheno -i <input-type> <infile> -o <output-type> <outfile> [options]
  convert-pheno -ipxf <infile> -obff <outfile> [options]
  convert-pheno -ipxf <infile> -obff --entities <list> --out-dir <dir> [options]

Common input flags:
  -i <type> <file>    Generic input form (pxf|bff|omop|redcap|cdisc|csv|openehr)
  -ibff <file>      Beacon v2 Models ('individuals' JSON|YAML)
  -ipxf <file>      Phenopackets v2 (JSON|YAML)
  -iomop <files...> OMOP-CDM CSV files or PostgreSQL dump
  -iopenehr <files...>
                    Experimental openEHR canonical JSON/YAML compositions
                    (currently EHRbase-oriented; input-only; BFF/PXF output)
  -iredcap <file>   REDCap export CSV
  -icdisc <file>    CDISC-ODM XML
  -icsv <file>      Raw CSV

Common output flags:
  -o <type> <file>    Generic output form (bff|pxf|omop|csv|jsonf|jsonld)
  -obff [file]      Beacon-oriented JSON/YAML output
                    Use -obff FILE for individuals-only BFF output
                    Use -obff with --entities --out-dir for entity-aware BFF
  -opxf <file>      Phenopackets v2 output
  -oomop            OMOP-CDM CSV table output (use with --out-dir)
  -ocsv <file>      Flattened CSV (with -ibff or -ipxf)
  -ojsonf <file>    Flattened 1D JSON/YAML (with -ibff or -ipxf)
  -ojsonld <file>   JSON-LD / YAML-LD (with -ibff or -ipxf)

Output behavior:
  --out-dir <dir>       Output directory (required for entity-aware BFF mode)
  --entities <list>     Beacon entities for BFF output [individuals]
                        Supported: individuals, biosamples, datasets, cohorts
                        biosamples are emitted from -ipxf when present, or
                        from -iomop when the SPECIMEN table is provided
                        datasets and cohorts are synthesized from individuals
                        Use with -obff and --out-dir
  --out-name k=file     Override one multi-file output name; repeat as needed
                        Use entity keys for BFF and table keys for OMOP
  -O                    Overwrite output file(s)

Mapping and search:
  --mapping-file <file>           Fields mapping YAML or JSON file
  --redcap-dictionary|-rcd <file> REDCap data dictionary CSV file
  --schema-file <file>            Alternative JSON Schema for mapping files
  --self-validate-schema|-svs     Self-validate the mapping JSON Schema
                                  (mainly for author/development checks)
  --search <type>                 Ontology label search mode [exact|mixed|fuzzy]
                                  Default: exact
  --text-similarity-method <m>    Token similarity for mixed/fuzzy search
                                  [cosine|dice], default: cosine
  --min-text-similarity-score <s> Minimum score for mixed/fuzzy search
                                  Default: 0.8
  --levenshtein-weight <w>        Levenshtein weight in fuzzy composite score
                                  Default: 0.1, range: 0-1
  --print-hidden-labels|-phl      Preserve original text labels in output
  --search-audit-tsv <file>       Write a TSV audit of ontology search results

OMOP:
  --ohdsi-db                      Enable Athena-OHDSI lookup
  --path-to-ohdsi-db <dir>        Directory containing <ohdsi.db>
  --omop-tables <list>            Restrict OMOP tables
  --exposures-file <file>         OMOP concept_id list treated as exposures
  --stream                        Incremental OMOP processing with single-file
                                  line-delimited JSON output
                                  Supports individuals-only <-obff FILE>, or
                                  entity-aware <-obff --entities individuals
                                  biosamples --out-dir DIR>
  --sql2csv                       Print SQL tables instead of converting
  --max-lines-sql <n>             Maximum SQL lines per table [500]

Other useful options:
  --default-vital-status <s>   Fallback PXF subject vitalStatus
                               [ALIVE|DECEASED|UNKNOWN_STATUS]
                               Used only with PXF output when no source
                               vitalStatus is available
  --separator|--sep <char>        Delimiter for CSV input [; for .csv]
  --username|-u <name>            Override username stored in metadata
  --test                          Skip time-varying metadata for stable tests
  --debug <level>                 Print internal request/debug data
                                  (level >= 2 also prints DB lookup summary)
  --verbose|-v                    Print progress information
  --version|-V
  --help
  --man

Examples:
  convert-pheno -i pxf pxf.json -o bff individuals.json
  convert-pheno -ipxf pxf.json -obff individuals.json
  convert-pheno -ipxf pxf.json -obff --entities biosamples --out-dir out/
  convert-pheno -ipxf pxf.json -obff --entities individuals biosamples --out-dir out/ --out-name biosamples=samples.json
  convert-pheno -ipxf pxf.json -obff --entities individuals datasets cohorts --out-dir out/
  convert-pheno -icsv data.csv --mapping-file mapping.yaml -obff --entities individuals datasets cohorts --out-dir out/
  convert-pheno -ibff individuals.json -oomop --out-dir omop_out/
  convert-pheno -ibff individuals.json -oomop --out-dir omop_out/ --out-name PERSON=patients.csv
  convert-pheno -iomop dump.sql.gz -opxf phenopackets.json.gz --default-vital-status UNKNOWN_STATUS --ohdsi-db
  convert-pheno -iomop dump.sql.gz -obff individuals.json.gz --stream --ohdsi-db
  convert-pheno -iomop dump.sql.gz -obff --stream --entities individuals biosamples --out-dir out/ --ohdsi-db
  convert-pheno -i openehr patient-set.json -o bff individual.json
  convert-pheno -i openehr patient-set.json -o pxf phenopacket.json

BFF output modes:
  -obff FILE keeps the individuals-only BFF behavior.
  -obff --entities ... --out-dir DIR writes one file per requested BFF entity.
  -oomop --out-dir DIR writes one file per emitted OMOP table.
  If a PXF input also contains biosamples, convert-pheno warns and preserves them
  under info.phenopacket.biosamples unless --entities is used.
  PXF output still defaults subject.vitalStatus to ALIVE unless a preserved
  source vitalStatus or --default-vital-status is available.
  openEHR input support is currently experimental and limited to BFF or PXF output.

Docs:
  Site:  https://cnag-biomedical-informatics.github.io/convert-pheno
  CLI:   https://cnag-biomedical-informatics.github.io/convert-pheno/use-as-a-command-line-interface/
  Install:
         https://cnag-biomedical-informatics.github.io/convert-pheno/download-and-installation/
EOF

    print {$out} $help_text;
}

# Print the short usage synopsis plus a pointer to --help.
#
# Arguments:
#   $out - optional filehandle/glob to print to; STDERR is used when the
#          argument is missing or false.
sub print_usage_summary {
    my ($out) = @_;
    $out ||= *STDERR;

    my @summary_lines = (
        'Usage:',
        '  convert-pheno -i <input-type> <infile> -o <output-type> <outfile> [options]',
        '  convert-pheno -ipxf <infile> -obff <outfile> [options]',
        '  convert-pheno -ipxf <infile> -obff --entities <list> --out-dir <dir> [options]',
        '',
        'Run `convert-pheno --help` for full usage',
    );

    print {$out} join( "\n", @summary_lines ) . "\n";
}

# Report a command-line usage problem and terminate.
#
# Writes "Error: <message>" to STDERR (terminated by exactly one newline),
# a blank line and the short usage summary, then exits with status 1.
#
# Arguments:
#   $message - text describing the problem; a trailing newline is optional.
sub usage_error {
    my ($message) = @_;

    my $report = "Error: $message";
    $report .= "\n" if $message !~ /\n\z/;

    # The extra newline separates the error from the usage summary
    print STDERR $report . "\n";
    print_usage_summary(*STDERR);

    exit 1;
}

# Print pointers to the online documentation (site, CLI guide and
# installation page) on the currently selected output handle.
sub print_man {
    my $pointer_text = <<'EOF';
Full CLI documentation now lives in Markdown instead of POD.

Documentation site:
  https://cnag-biomedical-informatics.github.io/convert-pheno

CLI guide:
  https://cnag-biomedical-informatics.github.io/convert-pheno/use-as-a-command-line-interface/

Installation:
  https://cnag-biomedical-informatics.github.io/convert-pheno/download-and-installation/
EOF

    print $pointer_text;
}

# Turn @ARGV into a request structure. usage_error() is handed over as a
# callback, together with the script-level defaults.
my $cli = build_cli_request(
    argv        => \@ARGV,
    usage_error => \&usage_error,
    schema_file => $schema_file,
    out_dir     => $out_dir,
    color       => 1,
    stream      => 0,
    ohdsi_db    => 0,
);

# Informational actions print their text and leave with status 0
if ( $cli->{action} eq 'help' ) {
    print_help();
    exit 0;
}
elsif ( $cli->{action} eq 'man' ) {
    print_man();
    exit 0;
}
elsif ( $cli->{action} eq 'version' ) {
    say "$0 Version $VERSION";
    exit 0;
}

# Unpack the request. $overwrite, $verbose and $stream are file-scoped on
# purpose: the subroutines defined below read them directly.
my ( $out_file, $log_file, $overwrite, $verbose, $stream, $color ) =
  @{$cli}{qw(out_file log_file overwrite verbose stream color)};
my $data = $cli->{data};
my $log  = $data->{log};

# Term::ANSIColor honours this environment variable
if ( !$color ) {
    $ENV{'ANSI_COLORS_DISABLED'} = 1;
}

# Debug dump of the request
print Dumper($data) if defined $data->{debug};

# Start printing to STDOUT
if ($verbose) {
    say BOLD CYAN program_header($VERSION), RESET;
}

# Save log file if $log (before $data gets blessed)
if ( defined $log ) {
    write_log( $log_file, $data );
}

#############################
# START DATA TRANSFORMATION #
#############################

convert( $out_file, $data );

###########################
# END DATA TRANSFORMATION #
###########################

# Drive one conversion from start to finish.
#
# Arguments:
#   $o_file - output file for single-file modes
#   $l_data - request hash (reads at least {method} and {entities})
#
# Steps: decide whether entity-aware BFF mode applies, optionally warn about
# biosamples in legacy PXF->BFF mode, clear the output targets, run the
# conversion and write the result. Progress text is printed when the
# file-level $verbose flag is set.
sub convert {
    my ( $o_file, $l_data ) = @_;

    # Entity mode applies to *2bff methods unless the request is exactly one
    # entity that is 'individuals' (a false first element counts as such)
    my $entity_mode = '';
    if ( $l_data->{method} =~ /2bff$/ ) {
        my @requested = @{ $l_data->{entities} || [] };
        my $individuals_only = @requested == 1
          && ( $requested[0] || 'individuals' ) eq 'individuals';
        $entity_mode = !$individuals_only;
    }

    maybe_warn_legacy_pxf_biosamples( $l_data, $entity_mode );
    prepare_output_targets( $l_data, $o_file, $entity_mode );

    # Start verbose
    if ($verbose) {
        print BOLD BLUE program_body($l_data), RESET;
    }

    # Build the converter object, run it and write whatever it returned
    my $converter = Convert::Pheno->new($l_data);
    my ( $result, $bundle ) =
      execute_conversion( $converter, $l_data, $o_file, $entity_mode );
    write_conversion_result( $l_data, $o_file, $result, $bundle );

    # Finish
    if ($verbose) {
        print BOLD GREEN program_footer(), RESET;
    }
}

# Warn once on STDERR when a PXF input carries biosamples but the run is the
# single-file (non entity-aware) pxf2bff conversion.
#
# Arguments:
#   $l_data      - request hash (reads {method} and {in_file})
#   $entity_mode - true when entity-aware BFF output was requested
#
# The input file is loaded through io_yaml_or_json() for this check. Each
# item may be a phenopacket itself or a hash wrapping one under the
# 'phenopacket' key. Returns nothing.
sub maybe_warn_legacy_pxf_biosamples {
    my ( $l_data, $entity_mode ) = @_;

    my $check_applies = !$entity_mode
      && $l_data->{method} eq 'pxf2bff'
      && defined $l_data->{in_file};
    return unless $check_applies;

    my $input = io_yaml_or_json(
        {
            filepath => $l_data->{in_file},
            mode     => 'read',
        }
    );

    # True when an item (or its wrapped phenopacket) has a non-empty
    # 'biosamples' array
    my $carries_biosamples = sub {
        my ($item) = @_;
        return 0 unless ref($item) eq 'HASH';

        my $pxf = exists $item->{phenopacket} ? $item->{phenopacket} : $item;
        return 0 unless ref($pxf) eq 'HASH';
        return 0 unless exists $pxf->{biosamples};
        return 0 unless ref( $pxf->{biosamples} ) eq 'ARRAY';

        return @{ $pxf->{biosamples} } ? 1 : 0;
    };

    my $warning = <<'EOF';
Warning: input PXF contains biosamples. Single-file <-obff FILE> mode emits only <individuals>; biosamples were preserved under <info.phenopacket.biosamples>. Use <-obff --entities biosamples --out-dir DIR> or <-obff --entities individuals biosamples --out-dir DIR> for first-class biosample output.
EOF

    # One warning is enough: stop at the first hit
    for my $item ( ref($input) eq 'ARRAY' ? @$input : ($input) ) {
        next unless $carries_biosamples->($item);
        print STDERR $warning;
        last;
    }

    return;
}

# Make sure the output paths of this run can be written.
#
# Arguments:
#   $l_data      - request hash (reads {entities} and {method})
#   $o_file      - output file for single-file modes
#   $entity_mode - true when one file per BFF entity will be written
#
# With the file-level $overwrite flag an existing target is deleted without
# asking; otherwise ask_overwrite() is consulted. Nothing is done here for
# *2omop methods outside entity mode.
sub prepare_output_targets {
    my ( $l_data, $o_file, $entity_mode ) = @_;

    # Clear the way for a single path
    my $make_room = sub {
        my ($path) = @_;
        if ( !$overwrite ) {
            ask_overwrite($path);
        }
        elsif ( -e $path ) {
            unlink($path);
        }
        return;
    };

    if ($entity_mode) {
        for my $entity ( @{ $l_data->{entities} } ) {
            $make_room->( resolve_entity_output_file( $l_data, $entity ) );
        }
        return;
    }

    return if $l_data->{method} =~ /2omop$/;

    $make_room->($o_file);
    return;
}

# Run the requested conversion on the converter object.
#
# Arguments:
#   $convert     - converter object
#   $l_data      - request hash (reads {method})
#   $o_file      - output file name (only used in the progress message)
#   $entity_mode - true when entity-aware BFF output was requested
#
# Returns a two-element list ( $data, $bundle ):
#   ( undef, undef )   the conversion method was run but nothing is handed
#                      back for writing (streamed or OMOP-input methods)
#   ( undef, $bundle ) entity mode, result of _run_bundle_view
#   ( $data, undef )   everything else, result of the conversion method
sub execute_conversion {
    my ( $convert, $l_data, $o_file, $entity_mode ) = @_;
    my $method = $l_data->{method};

    # Runs the method for its side effects only
    my $run_without_result = sub {
        $convert->$method;
        return ( undef, undef );
    };

    # Streamed omop2bff is checked first, before entity mode
    return $run_without_result->() if $stream && $method eq 'omop2bff';

    if ($entity_mode) {
        my $bundle = $convert->_run_bundle_view;
        return ( undef, $bundle );
    }

    # For omop2bff and omop2pxf we serialize to JSON (not YAML) by individual.
    my %returns_nothing = map { $_ => 1 } qw(omop2bff omop2pxf);
    if ( $stream || $returns_nothing{$method} ) {
        say BOLD GREEN "Writing <$o_file> file\n", RESET if $verbose;
        return $run_without_result->();
    }

    my $result = $convert->$method;
    return ( $result, undef );
}

# Write the outcome of a conversion to disk.
#
# Arguments:
#   $l_data - request hash (reads {method}, {entities}, {sep})
#   $o_file - output file for single-file modes
#   $data   - conversion result, or a false value when there is none
#   $bundle - bundle object for entity mode, or a false value
#
# Output form:
#   bundle given       one file per requested entity via io_yaml_or_json()
#   bff2csv / pxf2csv  one CSV file, separator {sep} or ';'
#   *2omop             one ';'-separated CSV per table, after the same
#                      overwrite handling used for the other targets
#   anything else      $data written to $o_file via io_yaml_or_json()
# Nothing is written when both $bundle and $data are false.
sub write_conversion_result {
    my ( $l_data, $o_file, $data, $bundle ) = @_;
    my $method = $l_data->{method};

    if ($bundle) {
        for my $entity ( @{ $l_data->{entities} } ) {
            my $target = resolve_entity_output_file( $l_data, $entity );
            say BOLD GREEN "Writing <$target> file\n", RESET if $verbose;
            io_yaml_or_json(
                {
                    filepath => $target,
                    mode     => 'write',
                    data     => $bundle->entities($entity),
                }
            );
        }
        return;
    }

    return unless $data;

    say BOLD GREEN "Writing <$o_file> file\n", RESET if $verbose;

    my $wants_flat_csv = $method eq 'bff2csv' || $method eq 'pxf2csv';

    if ($wants_flat_csv) {
        write_csv(
            {
                sep      => $l_data->{sep} // ';',
                filepath => $o_file,
                headers  => get_headers($data),
                data     => $data,
            }
        );
        return;
    }
    elsif ( $method =~ /2omop$/ ) {

        # Tables are written in sorted name order
        for my $table ( sort keys %$data ) {
            my $target = resolve_omop_table_output_file( $l_data, $table );

            if ( !$overwrite ) {
                ask_overwrite($target);
            }
            elsif ( -e $target ) {
                unlink($target);
            }

            # Pass a copy of the column list for this table
            my @columns = @{ $omop_headers->{$table} };
            write_csv(
                {
                    sep      => ";",
                    filepath => $target,
                    headers  => \@columns,
                    data     => $data->{$table},
                }
            );
        }
        return;
    }

    io_yaml_or_json(
        {
            filepath => $o_file,
            mode     => 'write',
            data     => $data,
        }
    );
}

# Work out the output path for one OMOP table.
#
# Arguments:
#   $l_data - request hash (reads {output_name_overrides} and {out_dir})
#   $table  - table name
#
# Returns the override stored under $table when one exists, otherwise
# <out_dir>/<table>.csv.
sub resolve_omop_table_output_file {
    my ( $l_data, $table ) = @_;

    my $has_override = exists $l_data->{output_name_overrides}
      && exists $l_data->{output_name_overrides}{$table};

    return $has_override
      ? $l_data->{output_name_overrides}{$table}
      : catfile( $l_data->{out_dir}, $table . '.csv' );
}

# Build the boxed banner shown at start-up in verbose mode.
#
# Arguments:
#   $version - version string to display
#
# Returns the banner as a multi-line string ending in a newline.
#
# The "Version: ..." line is centred inside the box at run time, so the
# right-hand border stays aligned for any version length (the previous
# fixed padding only lined up for 4-character versions such as "0.29",
# for which the output is unchanged). A label wider than the box is
# emitted without padding rather than truncated.
sub program_header {
    my $version = shift;

    my $inner_width = 38;                      # box width minus both borders
    my $label       = "Version: $version";
    my $slack       = $inner_width - length $label;
    $slack = 0 if $slack < 0;

    # Odd slack puts the extra blank on the right-hand side
    my $left_pad     = int( $slack / 2 );
    my $right_pad    = $slack - $left_pad;
    my $version_line = '*' . ( ' ' x $left_pad ) . $label . ( ' ' x $right_pad ) . '*';

    return <<"EOF";
****************************************
*  Phenotypic Data Model Convert Tool  *
*          - CONVERT-PHENO -           *
$version_line
*   (C) 2022-2026 Manuel Rueda, CNAG   *
*       The Artistic License 2.0       *
****************************************
EOF
}

# Return the closing message shown in verbose mode: the text line followed
# by one empty line.
sub program_footer {
    return "All done!\n\n";
}

# Build the two-line progress message naming the method and its input.
#
# Arguments:
#   $l_data - request hash; reads {method} plus {in_files} (array reference,
#             joined with commas) for methods starting with "omop", or
#             {in_file} for every other method
#
# Returns the message as a string ending in a newline.
sub program_body {
    my ($l_data) = @_;
    my $method = $l_data->{method};

    my $source;
    if ( $method =~ m/^omop/ ) {
        $source = join ',', @{ $l_data->{in_files} };
    }
    else {
        $source = $l_data->{in_file};
    }

    return "==== METHOD: <$method> ====\n" . "Processing: <$source>\n";
}

# Write a log file holding run information plus the request data.
#
# Arguments:
#   $log  - path of the log file to write
#   $data - request hash; stored as-is under 'data', its {id} is copied
#           into the 'info' section
#
# The 'info' section records date, number of logical CPUs, hostname, id,
# version and user name. The file is written through io_yaml_or_json().
sub write_log {
    my ( $log, $data ) = @_;

    # Detecting the number of logical CPUs across different OSes
    my $os = $^O;
    my $cpu_report =
        lc($os) eq 'darwin'  ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
      : lc($os) eq 'freebsd' ? qx{sysctl -n hw.ncpu}
      : $os eq 'MSWin32'     ? qx{wmic cpu get NumberOfLogicalProcessors}
      :                        qx{/usr/bin/nproc};

    # Take the first integer of the output. This covers the Windows case,
    # where the output also contains the "NumberOfLogicalProcessors" header,
    # and it copes with any probe that could not be run (undef) or printed
    # nothing usable: in those cases fall back to 1 instead of producing
    # "uninitialized"/"isn't numeric" warnings and a count of 0.
    my ($threadshost) = ( $cpu_report // '' ) =~ /(\d+)/;
    $threadshost = defined $threadshost ? 0 + $threadshost : 1;

    my $info = {
        date        => ( strftime "%a %b %e %H:%M:%S %Y", localtime ),
        threadshost => $threadshost,
        hostname    => hostname,
        id          => $data->{id},    # string
        version     => $VERSION,
        user        => $ENV{'LOGNAME'}
          || $ENV{'USER'}
          || $ENV{'USERNAME'}
          || 'dummy-user'
    };

    # Saving file (RESET keeps the colour from leaking into later output)
    say BOLD GREEN "Writing <$log> file\n", RESET if $verbose;
    io_yaml_or_json(
        {
            filepath => $log,
            mode     => 'write',
            data     => { info => $info, data => $data }
        }
    );
}

# Ask on the terminal whether an existing file may be overwritten.
#
# Arguments:
#   $filepath - path about to be written
#
# Does nothing unless $filepath is an existing plain file. Otherwise the
# user is prompted up to five times:
#   'Y'  deletes the file and returns
#   'n'  dies with "Operation aborted by the user."
#   anything else counts as an invalid attempt
#
# Dies when the file cannot be deleted, when STDIN reaches end-of-file
# before an answer is given, or after five invalid answers.
sub ask_overwrite {
    my $filepath = shift;

    # Check if the file exists
    return unless -f $filepath;

    my $max_attempts = 5;

    for my $attempt ( 1 .. $max_attempts ) {

        # RESET right after the prompt so that the answer, the "Invalid
        # input" text and any abort message are not printed in bold red
        print BOLD RED "<$filepath> exists. Overwrite? [Y/n]: ", RESET;

        my $input = <STDIN>;

        # End-of-file (closed or redirected STDIN): no answer can ever
        # arrive, so abort right away instead of looping on undef
        die "No answer available on STDIN. Operation aborted.\n"
          unless defined $input;

        chomp $input;

        if ( $input eq 'Y' ) {
            unlink($filepath)
              or die "Failed to delete <$filepath>: $!\n";
            return;    # Exit after successful deletion
        }

        die "Operation aborted by the user.\n" if $input eq 'n';

        say "Invalid input. Please enter 'Y' or 'n'.";
    }

    die "Too many invalid attempts. Operation aborted.\n";
}

# Work out the output path for one BFF entity.
#
# Arguments:
#   $l_data - request hash (reads {output_name_overrides} and {out_dir})
#   $entity - entity name
#
# Returns the override stored under $entity when one exists, otherwise
# <out_dir>/<entity>.json.
sub resolve_entity_output_file {
    my ( $l_data, $entity ) = @_;

    my $has_override = exists $l_data->{output_name_overrides}
      && exists $l_data->{output_name_overrides}{$entity};

    return $has_override
      ? $l_data->{output_name_overrides}{$entity}
      : catfile( $l_data->{out_dir}, $entity . '.json' );
}
