package Business::LegalRepData;
use strict;
use warnings;
our $VERSION = '0.03';

use URI::Escape;
use JSON;
use Data::Dumper;
use File::Spec;

=pod
USAGE:
my $whole_data_subfolder = "./echrapp_lawyers";
my $step = 1000;
my $range_order = $ARGV[0] or die "Must give the argument";
Business::LegalRepData::extract_json_range_number($whole_data_subfolder, $step, $range_order);
=cut



# Turn on autoflush for the currently selected output handle so that the
# progress output printed by the subs below is not held back by buffering.
$| = 1;


# Return the first <table class="table table-striped table-borderless"> element
# found in the given HTML string, from its opening tag through the nearest
# closing </table>. Returns undef when no such table is present.
sub extract_table_1_from_html {
    my ($page) = @_;

    my $table_re = qr{
		(<table\s+class="table\s+table-striped\s+table-borderless">.*?</table>)
	}xs;

    # A list-context match yields the captures, or an empty list on failure.
    my @captures = $page =~ $table_re;
    my $table    = $captures[0];
    return $table;
}


# Return the first <table class="applications-index table table-hover"> element
# found in the given HTML string, from its opening tag through the nearest
# closing </table>. Returns undef when no such table is present.
sub extract_table_2_from_html {
    my ($page) = @_;

    my $table_re = qr{
		(<table\s+class="applications-index\s+table\s+table-hover">.*?</table>)
	}xs;

    # A list-context match yields the captures, or an empty list on failure.
    my @captures = $page =~ $table_re;
    my $table    = $captures[0];
    return $table;
}


# Return the text of the first attribute-less <h1>...</h1> element whose
# content contains no '<' character. The text is returned exactly as it
# appears (no whitespace trimming). Returns undef when nothing matches.
sub extract_rep_name_from_html {
    my ($page) = @_;

    my $heading_re = qr{
		<h1>([^<]*)</h1>
	}xs;

    my @captures = $page =~ $heading_re;
    my $rep_name = $captures[0];
    return $rep_name;
}


# Pull the number N out of the sentence
#   "<b>N</b> application(s) available in the SOP database."
# (matched case-insensitively, with flexible whitespace).
# Returns the digits as captured, or undef when the sentence is not found.
sub extract_total_app_count_from_html {
    my ($page) = @_;

    my $count_re = qr{
        <b>\s*(\d+)\s*</b>\s*applications?\s+available\s+in\s+the\s+SOP\s+database\.
    }xsi;

    my @captures        = $page =~ $count_re;
    my $total_app_count = $captures[0];
    return $total_app_count;
}

# Collect every two-cell table row (<tr><td>KEY</td><td>VALUE</td></tr>) found
# in the given HTML string.
#
# Only rows opened by a bare "<tr>" whose two cells contain plain text (no
# nested tags) are matched. Surrounding whitespace is stripped from both cells.
#
# Returns a reference to an array of { key => ..., value => ... } hashes, in
# document order; the array is empty when no row matches.
sub extract_status_counts_array {
    my ($html) = @_;
    my @rows;

    # The captures are lazy ([^<]+?) so that the \s* following each of them
    # can consume trailing whitespace; a greedy capture would keep it.
    while ( $html =~ m{
        <tr>\s*
        <td\b[^>]*>\s*([^<]+?)\s*</td>\s*
        <td\b[^>]*>\s*([^<]+?)\s*</td>\s*
        </tr>
    }xsg ) {
        push @rows, {
            key   => $1,
            value => $2,
        };
    }

    return \@rows;
}

# Read one HTML file and extract the representative's data from it.
#
# Parameter:
#   $fn - path of the HTML file to read.
#
# Returns a hash reference with the keys:
#   total_app_count - result of extract_total_app_count_from_html
#   status_aohref   - result of extract_status_counts_array
#   rep_name        - result of extract_rep_name_from_html
#
# Dies (with the file name in the message) if the file cannot be opened or
# closed.
sub extract_hashref_from_file {
    my ($fn) = @_;

    # Lexical handle + three-argument open: no global FH is clobbered and the
    # file name can never be interpreted as an open mode.
    # NOTE(review): the file is read as raw bytes; if the pages are UTF-8 and
    # the result is later encoded with JSON->utf8, non-ASCII text may end up
    # double-encoded - confirm the input encoding.
    open( my $fh, '<', $fn ) or die "Can't open $fn for reading: $!\n";
    my $html_content = do { local $/; <$fh> };
    close($fh) or die "Can't close $fn: $!\n";

    return {
        total_app_count => extract_total_app_count_from_html($html_content),
        status_aohref   => extract_status_counts_array($html_content),
        rep_name        => extract_rep_name_from_html($html_content),
    };
}

# Split the given table HTML into its <tr>...</tr> rows, discard the first row
# and pass each remaining row to get_href_from_trow_2.
#
# NOTE: get_href_from_trow_2 is currently defined only inside the =pod section
# at the end of this file, so it is not compiled; calling this sub with a table
# that has more than one row fails at runtime until that helper is restored.
sub extract_tables_rows_from_table_2 {
    my ($table_2) = @_;

    # The first captured row is dropped; only the rows after it are processed.
    my ( undef, @data_rows ) = $table_2 =~ m{
		(<tr\b.*?</tr>)
	}xsg;

    for my $data_row (@data_rows) {
        get_href_from_trow_2($data_row);
    }
}






# Stopped here :

# Convert a numeric range of representative pages into one JSON file.
#
# Parameters:
#   $out_json_file_name - path of the JSON file to write
#   $raw_data_subfolder - directory holding the pages, named NNNNN.html
#                         (exactly five digits)
#   $first_rep_number   - lowest representative number to include
#   $last_rep_number    - highest representative number to include
#
# Every matching file whose number lies in [$first_rep_number,
# $last_rep_number] is parsed with extract_hashref_from_file; the file's
# five-digit number is stored under the key rep_database_number. The resulting
# array of hashes is written as a single line of JSON.
#
# Prints progress to the selected output handle. Dies if the directory cannot
# be read or the output file cannot be written.
sub read_and_save_rawdata {
    my ( $out_json_file_name, $raw_data_subfolder, $first_rep_number, $last_rep_number ) = @_;

    printf "raw_data_subfolder = %s\n", $raw_data_subfolder;

    opendir( my $dir_handle, $raw_data_subfolder )
        or die "Can't open directory $raw_data_subfolder: $!\n";
    # readdir returns entries in no particular order. Sorting the zero-padded
    # names puts them in ascending numeric order, which makes the output
    # deterministic and makes it safe to stop at the first number past the range.
    my @html_entries = sort grep { /^\d{5}\.html$/ } readdir($dir_handle);
    closedir($dir_handle);

    my @aoh;
    foreach my $entry (@html_entries) {
        my ($rep_number) = $entry =~ /^(\d{5})\.html$/;
        next if $rep_number < $first_rep_number;
        last if $rep_number > $last_rep_number;

        my $full_fn = File::Spec->catfile( $raw_data_subfolder, $entry );
        print ". ";
        my $href = extract_hashref_from_file($full_fn);
        $href->{rep_database_number} = $rep_number;
        push @aoh, $href;
    }

    my $json_text = JSON->new->utf8->encode( \@aoh );

    open( my $out_fh, '>', $out_json_file_name )
        or die "Can't open $out_json_file_name for writing: $!\n";
    print {$out_fh} $json_text, "\n"
        or die "Can't write to $out_json_file_name: $!\n";
    # Buffered write errors only surface at close, so check it as well.
    close($out_fh) or die "Can't close $out_json_file_name: $!\n";

    printf "\nData output to %s\n", $out_json_file_name;
}


# Split the file numbers 1 .. $last_possible_file_number into consecutive
# ranges of at most $step numbers each; the final range is shortened so that
# it ends exactly at $last_possible_file_number.
#
# Parameters:
#   $step                      - size of each range; must be greater than zero
#   $last_possible_file_number - optional upper bound, defaults to 38301
#
# Returns a list of hash references { fst_number => ..., lst_number => ... }
# in ascending order, and prints a one-line summary.
# Dies when $step is undefined or not positive (the loop would never end).
sub generate_file_number_ranges {
    my ( $step, $last_possible_file_number ) = @_;
    $last_possible_file_number = 38301 unless defined $last_possible_file_number;

    die "step must be a positive number" unless defined $step && $step > 0;

    my @file_number_ranges;
    for ( my $fst_number = 1; $fst_number <= $last_possible_file_number; $fst_number += $step ) {
        my $lst_number = $fst_number + $step - 1;
        # Clamp the last range to the upper bound.
        $lst_number = $last_possible_file_number if $lst_number > $last_possible_file_number;
        push @file_number_ranges, { fst_number => $fst_number, lst_number => $lst_number };
    }

    printf "step = %d. Total ranges generated : %d\n", $step, scalar @file_number_ranges;
    return @file_number_ranges;
}

# Export one range of representative pages to a JSON file.
#
# Parameters:
#   $whole_data_subfolder - directory containing the NNNNN.html pages
#   $step                 - range size passed to generate_file_number_ranges
#   $range_order          - 1-based index of the range to export
#
# The output is written to echrapp_rep_data/FFFFF_LLLLL.json, where FFFFF and
# LLLLL are the zero-padded first and last file numbers of the chosen range.
# The echrapp_rep_data directory is created when it does not exist yet.
#
# Dies when $range_order is not an integer between 1 and the number of
# generated ranges, or when the output directory cannot be created.
sub extract_json_range_number {
    my ( $whole_data_subfolder, $step, $range_order ) = @_;

    my @file_number_ranges = generate_file_number_ranges($step);
    my $range_count        = scalar @file_number_ranges;

    # Without this check a value of 0 would silently select the LAST range
    # (array index -1) and a too-large value would yield undefined bounds.
    my $range_order_is_valid =
           defined $range_order
        && $range_order =~ /\A\d+\z/
        && $range_order >= 1
        && $range_order <= $range_count;
    unless ($range_order_is_valid) {
        die sprintf(
            "range_order must be an integer between 1 and %d, got '%s'\n",
            $range_count, defined $range_order ? $range_order : 'undef'
        );
    }
    print "range_order = $range_order\n";

    my $range             = $file_number_ranges[ $range_order - 1 ];
    my $first_file_number = $range->{fst_number};
    my $last_file_number  = $range->{lst_number};

    my $json_filename = sprintf "%05d_%05d.json", $first_file_number, $last_file_number;
    printf "%d => %d. json_filename = %s\n", $first_file_number, $last_file_number, $json_filename;

    # Make sure the output directory exists before the (long) scan starts, so
    # a missing directory cannot make the final write fail.
    my $output_folder = "echrapp_rep_data";
    unless ( -d $output_folder ) {
        mkdir $output_folder or die "Can't create directory $output_folder: $!\n";
    }

    my $full_json_filename = File::Spec->catfile( $output_folder, $json_filename );
    read_and_save_rawdata( $full_json_filename, $whole_data_subfolder, $first_file_number, $last_file_number );
}


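=pod

NOTE: The subroutines between this =pod and the closing =cut are disabled:
Perl treats this section as documentation and does not compile it. In
particular, get_href_from_trow_2 (called by extract_tables_rows_from_table_2)
is not available at runtime while it stays in this section.
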
sub get_href_from_trow_2{
	my $row = shift;
	my ($name) = $row =~ m{
		<td\s+class="name-col\s+d-none\s+d-sm-table-cell">\s*(.*?)\s*</td>
	}xs;
	my ($state) = $row =~ m{
	  <td\b[^>]*\bclass="[^"]*\bstate-col\b[^"]*"[^>]*>\s*
	  (?:<a\b[^>]*>)?\s*([^<]+)\s*(?:</a>)?
	  \s*</td>
	}xs;
	print "============= $name $state\n";
}

sub extract_aoh_from_html_content{
	my $html_content = shift;
	my $tbody = extract_tbody($html_content);
	my @tbodies = $tbody =~ /<tr>(.*?)<\/tr>/sg;
	my @aoh;
	foreach my $trow (@tbodies){
		my $hashref = get_hashref_from_trow($trow);
		push @aoh, $hashref;
		# printf "%s \n%s\n", $trow, '-' x 40 ;
	}
	return @aoh;
}

sub get_hashref_from_trow{
	my $trow = shift;
	my ($current_state)     = $trow =~ /<td class="current-state-col d-table-cell">([^<]+)<\/td>/g;
	my ($state)             = $trow =~ /<td class="state-col d-none d-sm-table-cell"><a href=[^<]+>([^<]+)<\/a><\/td>/g; # new
	my ($name)              = $trow =~ /<td class="name-col d-none d-sm-table-cell">([^<]+)<\/td>/g;
	my ($date)              = $trow =~ /<td class="date-col d-none d-sm-table-cell">([^<]+)<\/td>/g;
	my ($rep_id, $rep_name)              = $trow =~ /<td class="rep-col d-none d-lg-table-cell"><a href="\/\w+\/(\d*)">([^<]*)<\/a><\/td>/ ? ($1, $2) : ('', '');
	my ($app_database_id, $app_legal_id) = $trow =~ /<td class="number-col d-none d-sm-table-cell"><a href="\/applications\/(\d*)">([^<]*)<\/a><\/td>/ ? ($1, $2) : ('', '');
	
	my $hashref = {
		current_state => $current_state,
		name => $name,
		state=> $state,
		date => $date,
		rep_id => $rep_id,
		rep_name => $rep_name,
		app_database_id => $app_database_id,
		app_legal_id => $app_legal_id,
		};
	return $hashref;
}

sub extract_aoh_from_file{
	my $fn = shift;
	open FH, "<$fn" or die $!;
	my $html_content = do {local $/; <FH>};
	my @aoh = extract_aoh_from_html_content($html_content);

	# print Dumper \@aoh;
	return @aoh;
}

sub read_destination_folder_aohref{
	my $dir = shift;
	opendir(D, "$dir") || die "Can't open directory $dir: $!\n";
	my @file_list = readdir(D);
	my @aoh;
    foreach my $html_filename (@file_list) {
		next unless $html_filename =~ /\d{4}-\d{2}-\d{2}_(\d{4}-\d{2}-\d{2})_(\d{4})/;
		my $end_date_of_span = $1;
		my $page = $2;
		print ".";
		my $full_file_path = File::Spec->catdir($dir, $html_filename);
		my $metadata = {end_date_of_span=>$end_date_of_span, page=>$page, html_filename=>$html_filename};
		my @aoh_from_one_file = extract_aoh_from_file($full_file_path, $metadata);
		push @aoh, @aoh_from_one_file;
    }
	return \@aoh;
}

sub read_rawdata_by_month_dirs{
	my $whole_data_subfolder = shift;
	my $start_folder_prefix = shift;
	my $folder_count = shift;
	printf "whole_data_subfolder = %s\n", $whole_data_subfolder;
    opendir(D, "$whole_data_subfolder") || die "Can't open directory $whole_data_subfolder: $!\n";
    my @file_list = readdir(D);
    my @aoh;
	my $count = 0;
    foreach my $entry (@file_list) {
        if($entry ge $start_folder_prefix){
			$count++;
			last if $count > $folder_count;
			my $destination_folder = File::Spec->catdir($whole_data_subfolder, $entry);
            print $destination_folder, "\n";
            
			my $aohref = read_destination_folder_aohref($destination_folder);
			printf "count = %d\n", scalar @$aohref;
			
			my $json_text = JSON->new->utf8->encode($aohref);
			my $json_file_name = File::Spec->catdir($whole_data_subfolder, "$entry.json.txt");
			open FH, ">$json_file_name" or die $!;
			print FH $json_text, "\n";
			close FH;
			printf "Metadata output to %s\n", $json_file_name;			
        }
    }
    closedir(D);
	return @aoh;
}





sub get_aoh_from_one_json{
	my $whole_data_subfolder = shift;
	my $fn = shift;
	my $full_file_path = File::Spec->catdir($whole_data_subfolder, $fn);
	open( my $fh, '<', "$full_file_path" );
	my $json_text   = <$fh>;
	my $perl_scalar = decode_json( $json_text );
	# print Dumper $perl_scalar;
	close($fh);
	# printf "count = %d\n", scalar @$perl_scalar;
	return @$perl_scalar;
}

sub get_aoh_from_range_of_jsons{
	my $whole_data_subfolder = shift;
	
    opendir(D, "$whole_data_subfolder") || die "Can't open directory $whole_data_subfolder: $!\n";
    my @file_list = readdir(D);
    my @cumulative_aoh;
    foreach my $f (@file_list) {
		next if ( -d $f or $f !~ /\.json\.txt$/g);
		my @aoh = get_aoh_from_one_json($whole_data_subfolder, $f);
		printf "%d ", scalar @aoh;
		push @cumulative_aoh, @aoh;
		# print "$f ";
	}
	return @cumulative_aoh;
}
=cut

1;
