Perl/FileCache : GyparkWiki

#!/usr/bin/env perl
use strict;
use warnings;
use autodie;
use FileCache maxopen => 128;
use Time::HiRes qw/gettimeofday tv_interval/;

# Validate the single command-line argument and print the usage text when it
# is missing or not one of the supported two-digit mode codes.
# The digits-only guard runs before the numeric comparison so that a
# non-numeric argument neither triggers an "isn't numeric" warning nor gets
# accepted by accident (e.g. "11abc" comparing equal to 11).
if ( not defined $ARGV[0]
    or $ARGV[0] !~ /\A[0-9]+\z/
    or not grep { $ARGV[0] == $_ } (11,12,21,22,31,32,41,42,51,52) ) {
    print <<EOF;
arg:
11 - in-memory sort. sequantial
12 - in-memory sort. random

21 - open-close in each iteration. sequatial
22 - open-close in each iteration. random

31 - open-close once. maybe die. sequantial
32 - open-close once. maybe die. random

41 - use cacheout. sequantial
42 - use cacheout. random

51 - use cacheout. sequantial
52 - use cacheout. maybe error. random
EOF
    exit 1;
}

# The last digit of the argument selects the input file and the suffix of the
# output file name (1 picks the "seq" pair, anything else the "rnd" pair);
# the leading digit selects the method to run.
my ( $input_file, $output_postfix ) =
    ( $ARGV[0] % 10 == 1 )
    ? ( 'input_seq.txt', 'seq.txt' )
    : ( 'input_rnd.txt', 'rnd.txt' );
my $method = int( $ARGV[0] / 10 );

if ( $method == 1 ) {       # in-memory sort
    # Method 1: read the whole input into memory, count lines per key, then
    # sort every line by (count of its key, descending; key, numeric ascending).
    print "method 1 - in-memory sort. [$input_file]\n";
    my $started = [ gettimeofday ];

    # Slurp the input, one array element per line.
    open my $reader, '<', $input_file;
    my @lines = <$reader>;
    close $reader;
    print " laptime time after loading file : ", tv_interval($started), "\n";

    # The key of a line is its first whitespace-separated field.
    my %count_of;
    for my $record ( @lines ) {
        $count_of{ (split /\s+/, $record)[0] }++;
    }

    print " laptime time after counting : ", tv_interval($started), "\n";

    # Pair each line with its key once, order the pairs, then unwrap the lines.
    my @keyed   = map { [ (split /\s+/, $_)[0], $_ ] } @lines;
    my @ordered = sort {
        $count_of{ $b->[0] } <=> $count_of{ $a->[0] }
            || $a->[0] <=> $b->[0]
    } @keyed;
    my @sorted = map { $_->[1] } @ordered;
    print " laptime time after sorting : ", tv_interval($started), "\n";

    my $output_file = "sorted_1_$output_postfix";
    open my $writer, '>', $output_file;
    print {$writer} $_ for @sorted;
    close $writer;
    print "method 1 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($started), "\n";

}
elsif ( $method == 2 ) {    # open-close in each iteration
    # Method 2: split the input into one scratch file per key (named after the
    # key, in the current directory), opening and closing the scratch file for
    # every single line, then concatenate the scratch files in sorted key order.
    print "method 2 - open-close in each iteration. [$input_file]\n";
    my $t0 = [ gettimeofday ];

    my %freq = ();
    open my $in, '<', $input_file;
    while ( my $line = <$in> ) {
        # The key of a line is its first whitespace-separated field.
        my $key = (split /\s+/, $line)[0];

        # Truncate on the first line seen for a key and append afterwards.
        # Appending unconditionally would merge in stale lines from a scratch
        # file left behind by an earlier run that aborted before its cleanup.
        my $mode = $freq{$key}++ ? '>>' : '>';

        open my $tmp, $mode, $key;
        print {$tmp} $line;
        close $tmp;
    }
    close $in;
    print " laptime time after loading, counting, splitting : ", tv_interval($t0), "\n";

    # Merge: most frequent key first, ties broken by ascending numeric key.
    my $output_file = "sorted_2_$output_postfix";
    open my $out, ">", $output_file;
    foreach my $key ( sort { $freq{$b} <=> $freq{$a} or $a <=> $b } keys %freq ) {
        open my $tmp, '<', $key;
        while ( my $line = <$tmp> ) {
            print {$out} $line;
        }
        close $tmp;
        unlink $key;        # scratch file is no longer needed
    }
    close $out;

    print "method 2 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($t0), "\n";
}
elsif ( $method == 3 ) {    # open-close once
    # Method 3: like method 2, but each per-key scratch file is opened only
    # once and its handle is kept in a hash until the whole input is consumed.
    # Every distinct key therefore holds one open handle at the same time.
    print "method 3 - open-close once. [$input_file]\n";
    my $started = [ gettimeofday ];

    my ( %count_of, %writer_for );
    open my $reader, '<', $input_file;
    while ( my $record = <$reader> ) {
        # The key of a line is its first whitespace-separated field.
        my $key = (split /\s+/, $record)[0];
        $count_of{$key}++;

        # Open the scratch file only the first time this key shows up.
        open $writer_for{$key}, '>', $key unless exists $writer_for{$key};
        print { $writer_for{$key} } $record;
    }
    close $reader;
    for my $writer ( values %writer_for ) {
        close $writer;
    }
    print " laptime time after loading, counting, splitting : ", tv_interval($started), "\n";

    # Merge order: most frequent key first, ties by ascending numeric key.
    my @merge_order =
        sort { $count_of{$b} <=> $count_of{$a} || $a <=> $b } keys %count_of;

    my $output_file = "sorted_3_$output_postfix";
    open my $merged, ">", $output_file;
    for my $key ( @merge_order ) {
        open my $part, '<', $key;
        while ( my $record = <$part> ) {
            print {$merged} $record;
        }
        close $part;
        unlink $key;
    }
    close $merged;

    print "method 3 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($started), "\n";
}
elsif ( $method == 4 ) {                      # cacheout
    # Method 4: split the input into per-key scratch files ("<key>-t") through
    # FileCache's cacheout, which is asked for the handle again on every line,
    # then concatenate the scratch files in sorted key order.
    no strict 'refs';   # cacheout returns the path string, used as a symbolic filehandle
    use IO::Handle;     # kept: `use` loads at compile time for the whole file, and the
                        # method 5 branch calls ->flush on a handle
    print "method 4 - use cacheout. [$input_file]\n";
    my $t0 = [ gettimeofday ];

    my %freq = ();
    open my $in, '<', $input_file;
    while ( my $line = <$in> ) {
        # The key of a line is its first whitespace-separated field.
        my $key = (split /\s+/, $line)[0];
        $freq{$key}++;

        # Ask FileCache for the handle on every line.
        my $fh = cacheout "$key-t";
        print {$fh} $line;
    }
    close $in;

    # Close every handle FileCache still holds before the scratch files are
    # read back. This writes out any buffered lines once per file (instead of
    # flushing after every line) and ensures no scratch file is still open
    # when it is unlinked below. cacheout_close skips handles that are no
    # longer open.
    cacheout_close("$_-t") foreach keys %freq;
    print " laptime time after loading, counting, splitting : ", tv_interval($t0), "\n";

    # Merge: most frequent key first, ties broken by ascending numeric key.
    my $output_file = "sorted_4_$output_postfix";
    open my $out, ">", $output_file;
    foreach my $key ( sort { $freq{$b} <=> $freq{$a} or $a <=> $b } keys %freq ) {
        open my $tmp, '<', "$key-t";
        while ( my $line = <$tmp> ) {
            print {$out} $line;
        }
        close $tmp;
        unlink "$key-t";
    }
    close $out;

    print "method 4 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($t0), "\n";
}
elsif ( $method == 5 ) {
    # Method 5: like method 4, but cacheout is called only the first time a
    # key is seen; its return value is stored and reused for all later lines
    # of that key, so FileCache is never asked to re-open a handle.
    # NOTE(review): presumably this is what the banner's "maybe error with
    # random sequence" refers to, i.e. a stored handle that FileCache has
    # closed in the meantime; confirm against the FileCache documentation.
    no strict 'refs';
    print "method 5 - use cacheout (maybe error with random sequence). [$input_file]\n";
    my $started = [ gettimeofday ];

    my ( %count_of, %handle_for );
    open my $reader, '<', $input_file;
    while ( my $record = <$reader> ) {
        # The key of a line is its first whitespace-separated field.
        my $key = (split /\s+/, $record)[0];
        $count_of{$key}++;

        $handle_for{$key} = cacheout "$key.txt"
            unless exists $handle_for{$key};
        print { $handle_for{$key} } $record;
        $handle_for{$key}->flush();
    }
    close $reader;
    print " laptime time after loading, counting, splitting : ", tv_interval($started), "\n";

    # Merge order: most frequent key first, ties by ascending numeric key.
    my @merge_order =
        sort { $count_of{$b} <=> $count_of{$a} || $a <=> $b } keys %count_of;

    my $output_file = "sorted_5_$output_postfix";
    open my $merged, ">", $output_file;
    for my $key ( @merge_order ) {
        my $part_name = "$key.txt";
        open my $part, '<', $part_name;
        while ( my $record = <$part> ) {
            print {$merged} $record;
        }
        close $part;
        unlink $part_name;
    }
    close $merged;

    print "method 5 - save [$output_file]\n";
    print " total elapsed time : ", tv_interval($started), "\n";
}
Perl/FileCache

FileCache 코어 모듈

1. 테스트 기록

1.1. 테스트 상황

1.2. 메모리에 다 불러와서 정렬

1.3. 키 별로 개별 파일에 저장 후 병합

1.4. open 횟수를 줄이려는 시도

1.5. FileCache 모듈 사용

1.5.1. 핸들을 제 때 닫지 않아서 생기는 문제

1.6. exists 검사와 cacheout을 같이 쓰는 경우

2. Comments