#!/usr/bin/env perl
use strict;
use warnings;
use autodie;
use FileCache maxopen => 128;
use Time::HiRes qw/gettimeofday tv_interval/;
# Validate the mode argument.  It is matched as a string against the ten
# documented codes (tens digit 1-5 = method, ones digit 1 = sequential input,
# 2 = random input) rather than compared with ==, so non-numeric or partly
# numeric input such as "abc" or "11abc" prints the usage text without
# "isn't numeric" warnings and is never silently coerced into a valid code.
my $mode = $ARGV[0];
if ( not defined $mode or $mode !~ /\A[1-5][12]\z/ ) {
    print <<EOF;
arg:
11 - in-memory sort. sequential
12 - in-memory sort. random
21 - open-close in each iteration. sequential
22 - open-close in each iteration. random
31 - open-close once. maybe die. sequential
32 - open-close once. maybe die. random
41 - use cacheout. sequential
42 - use cacheout. random
51 - use cacheout. sequential
52 - use cacheout. maybe error. random
EOF
    exit 1;
}

# The ones digit selects the data set (and the suffix of the result file);
# the tens digit selects which method branch runs below.
my $is_sequential  = ( $mode % 10 == 1 );
my $input_file     = $is_sequential ? 'input_seq.txt' : 'input_rnd.txt';
my $output_postfix = $is_sequential ? 'seq.txt'       : 'rnd.txt';
my $method         = int( $mode / 10 );
if ( $method == 1 ) {
    # Method 1: read the whole input into memory, count how often each key
    # (the first whitespace-separated field of a line) occurs, then sort all
    # lines by descending key frequency, breaking ties by ascending key
    # compared numerically.  Lap times are printed after each phase.
    print "method 1 - in-memory sort. [$input_file]\n";
    my $started = [ gettimeofday ];

    # Phase 1: slurp every line.
    open my $source, '<', $input_file;
    my @lines = <$source>;
    close $source;
    print " laptime time after loading file : ", tv_interval($started), "\n";

    # Phase 2: tally occurrences per key.
    my %count_of = ();
    for my $record (@lines) {
        my $key = ( split /\s+/, $record )[0];
        $count_of{$key}++;
    }
    print " laptime time after counting : ", tv_interval($started), "\n";

    # Phase 3: decorate each line with its key, sort on the decoration,
    # then strip it again so only the original lines remain.
    my @sorted =
        map  { $_->[1] }
        sort {
               $count_of{ $b->[0] } <=> $count_of{ $a->[0] }
            or $a->[0] <=> $b->[0]
        }
        map  { [ ( split /\s+/, $_ )[0], $_ ] }
        @lines;
    print " laptime time after sorting : ", tv_interval($started), "\n";

    # Phase 4: write the result file.
    my $output_file = "sorted_1_$output_postfix";
    open my $sink, ">", $output_file;
    print {$sink} $_ for @sorted;
    close $sink;

    print "method 1 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($started), "\n";
}
elsif ( $method == 2 ) {
    # Method 2: stream the input once, writing every line to a scratch file
    # named after its key (the first whitespace-separated field) and
    # reopening that file for each line.  Afterwards the scratch files are
    # concatenated into sorted_2_<postfix>, most frequent key first (ties
    # broken by ascending key compared numerically), and deleted.
    print "method 2 - open-close in each iteration. [$input_file]\n";
    my $t0 = [ gettimeofday ];
    my %freq = ();
    open my $in, '<', $input_file;
    while ( my $line = <$in> ) {
        my $key = ( split /\s+/, $line )[0];

        # Truncate the scratch file the first time a key is seen and append
        # afterwards.  Always appending would pull the content of a leftover
        # file with the same name (e.g. from an interrupted earlier run)
        # into the output.
        my $open_mode = $freq{$key}++ ? '>>' : '>';
        open my $tmp, $open_mode, $key;
        print {$tmp} $line;
        close $tmp;
    }
    close $in;
    print " laptime time after loading, counting, splitting : ", tv_interval($t0), "\n";

    my $output_file = "sorted_2_$output_postfix";
    open my $out, ">", $output_file;
    foreach my $key ( sort { $freq{$b} <=> $freq{$a} or $a <=> $b } keys %freq ) {
        # Copy this key's scratch file line by line, then remove it.
        open my $tmp, '<', $key;
        while ( my $line = <$tmp> ) {
            print {$out} $line;
        }
        close $tmp;
        unlink $key;
    }
    close $out;
    print "method 2 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($t0), "\n";
}
elsif ( $method == 3 ) {
    # Method 3: like method 2, but each key's scratch file is opened once
    # (truncating) and its handle kept in a hash until the whole input has
    # been read.  Every distinct key therefore holds an open handle at the
    # same time; the usage text labels this variant "maybe die".
    print "method 3 - open-close once. [$input_file]\n";
    my $started = [ gettimeofday ];

    my %count_of  = ();
    my %handle_for = ();

    open my $source, '<', $input_file;
    while ( my $record = <$source> ) {
        my $key = ( split /\s+/, $record )[0];
        $count_of{$key}++;

        # Open the scratch file only on the first sighting of this key.
        unless ( exists $handle_for{$key} ) {
            open $handle_for{$key}, '>', $key;
        }
        print { $handle_for{$key} } $record;
    }
    close $source;

    # Close every scratch file before reading them back.
    for my $handle ( values %handle_for ) {
        close $handle;
    }
    print " laptime time after loading, counting, splitting : ", tv_interval($started), "\n";

    my $output_file = "sorted_3_$output_postfix";
    open my $sink, ">", $output_file;

    # Most frequent key first; ties by ascending key compared numerically.
    my @ordered_keys =
        sort { $count_of{$b} <=> $count_of{$a} or $a <=> $b } keys %count_of;
    for my $key (@ordered_keys) {
        open my $part, '<', $key;
        while ( my $record = <$part> ) {
            print {$sink} $record;
        }
        close $part;
        unlink $key;
    }
    close $sink;

    print "method 3 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($started), "\n";
}
elsif ( $method == 4 ) {
    # Method 4: split the input into per-key scratch files "<key>-t" through
    # FileCache's cacheout, which is asked for the handle again on every
    # line.  The scratch files are then concatenated by descending key
    # frequency (ties by ascending key compared numerically) and deleted.
    #
    # NOTE(review): strict refs is switched off presumably because the value
    # returned by cacheout is used as a symbolic filehandle -- confirm
    # against the FileCache documentation.
    no strict 'refs';
    use IO::Handle;

    print "method 4 - use cacheout. [$input_file]\n";
    my $started = [ gettimeofday ];

    my %count_of   = ();
    my %handle_for = ();

    open my $source, '<', $input_file;
    while ( my $record = <$source> ) {
        my $key = ( split /\s+/, $record )[0];
        $count_of{$key}++;

        # Request the handle afresh for every line.
        $handle_for{$key} = cacheout("$key-t");
        print { $handle_for{$key} } $record;

        # The cacheout handles are never closed explicitly in this branch
        # before the files are read back, so each write is flushed at once
        # (presumably to make sure the data is on disk by then).
        $handle_for{$key}->flush;
    }
    close $source;
    print " laptime time after loading, counting, splitting : ", tv_interval($started), "\n";

    my $output_file = "sorted_4_$output_postfix";
    open my $sink, ">", $output_file;

    my @ordered_keys =
        sort { $count_of{$b} <=> $count_of{$a} or $a <=> $b } keys %count_of;
    for my $key (@ordered_keys) {
        my $part_file = "$key-t";
        open my $part, '<', $part_file;
        while ( my $record = <$part> ) {
            print {$sink} $record;
        }
        close $part;
        unlink $part_file;
    }
    close $sink;

    print "method 4 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($started), "\n";
}
elsif ( $method == 5 ) {
    # Method 5: as method 4, but cacheout is called only the first time a
    # key is seen; the value it returned is remembered and reused for every
    # later line of that key.  Scratch files are named "<key>.txt".
    #
    # NOTE(review): the banner below warns of possible errors with random
    # input -- presumably FileCache may close a handle once more files than
    # its maxopen limit are in play, and this branch never asks for it
    # again.  Confirm against the FileCache documentation; the behaviour is
    # left as is because this variant exists to demonstrate it.
    no strict 'refs';

    print "method 5 - use cacheout (maybe error with random sequence). [$input_file]\n";
    my $started = [ gettimeofday ];

    my %count_of   = ();
    my %handle_for = ();

    open my $source, '<', $input_file;
    while ( my $record = <$source> ) {
        my $key = ( split /\s+/, $record )[0];
        $count_of{$key}++;

        # Only the first sighting of a key goes through cacheout.
        $handle_for{$key} = cacheout("$key.txt")
            unless exists $handle_for{$key};

        print { $handle_for{$key} } $record;
        $handle_for{$key}->flush;
    }
    close $source;
    print " laptime time after loading, counting, splitting : ", tv_interval($started), "\n";

    my $output_file = "sorted_5_$output_postfix";
    open my $sink, ">", $output_file;

    # Most frequent key first; ties by ascending key compared numerically.
    my @ordered_keys =
        sort { $count_of{$b} <=> $count_of{$a} or $a <=> $b } keys %count_of;
    for my $key (@ordered_keys) {
        my $part_file = "$key.txt";
        open my $part, '<', $part_file;
        while ( my $record = <$part> ) {
            print {$sink} $record;
        }
        close $part;
        unlink $part_file;
    }
    close $sink;

    print "method 5 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($started), "\n";
}