|
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
Mark Dominus wrote:
> [lotso good stuff]
Thanks for the details. I especially found the code sample and this
explanation *very* helpful:
> Here if the input is already a file, we use the file it's in. This
> handles all three cases:
>
> pivot_dump | col_width # data is copied to /tmp/cw12345
> col_width foo # data is not copied
> col_width < foo # data is not copied
>
> The second case has $input_file defined, and the third case has
> $input_file undefined but -f STDIN true.
I did end up going with a tempfile solution, though by using the above
explanation I see I can smarten it up a bit...
I think this will be the end goal:
1. if rewindable, just do that
2. else buffer up to a certain memory usage (default or
passed in value), and then switch to tempfile solution
#2 will take some research, I don't yet know if I can/how to check
memory consumption of my @slurp variable on the fly.
Anyway, attached for the list's enjoyment is my script for formatting
delimited data. Works pretty nice.
prl
#!/usr/local/bin/perl
use warnings;
use diagnostics;
use strict;
use Getopt::Long;
use File::Temp qw/ tempfile /;
###
# parm defaults and validation
###
my $in_delim = "\t";
my $out_delim;
my $fixed_width;
my $var_width;
my %out_col_format = ();
my $out_spacer;
GetOptions (
"in_delimiter|delimiter|d=s" => \$in_delim,
"out_delimiter|od=s" => \$out_delim,
"fixed_width|fw:i" => \$fixed_width, # standalone flag, or can take a max width
"variable_width|var_width|vw:i" => \$var_width, # standalone flag, or can take a max width
"out_column_format|out_cf|ocf=s" => \%out_col_format,
"out_spacer|out_s|os=s" => \$out_spacer,
);
die "Specified both --fixed_width and --variable_width"
if
defined $fixed_width
and
defined $var_width;
die "--out_column_format keys may be integers only"
if grep /\D/, keys %out_col_format;
my $qr_in_delim = qr/$in_delim/;
###
# main logic
###
if (!defined $fixed_width and !defined $var_width)
{
while (<>)
{
print format_line(
line => $_
,qr_in_delim => $qr_in_delim
,(
defined $out_delim
? (out_delim => $out_delim)
: ()
)
,(
keys %out_col_format
? (out_col_format => \%out_col_format)
: ()
)
,(
defined $out_spacer
? (out_spacer => $out_spacer)
: ()
)
);
}
exit;
}
# Create tempfile which will be auto-deleted when program exits
my $fh = tempfile(UNLINK => 1);
my ($max_width, $in_max_width);
if (defined $fixed_width)
{
if ($fixed_width > 0)
{
$max_width = $fixed_width;
}
else
{
$in_max_width = 0;
}
}
if (defined $var_width)
{
if ($var_width > 0)
{
$max_width = $var_width;
}
$in_max_width = [];
}
for (<>)
{
print $fh $_;
# if we need to collect max width info...
if (defined $in_max_width)
{
chomp;
if (ref $in_max_width) # we're looking for the widths of the widest field for *each* column
{
my @in = map
{
my $idx = -1;
if (defined $out_col_format{++$idx})
{
$_ = sprintf $out_col_format{$idx}, $_;
}
length;
} split $qr_in_delim;
for (0 .. $#in)
{
$in_max_width->[$_] = $in[$_]
if !defined $in_max_width->[$_]
or
$in[$_] > $in_max_width->[$_];
}
}
else # we're looking for max width of all the fields
{
$in_max_width = (
sort { $b <=> $a }
$in_max_width
,(
sort { $b <=> $a }
map { length }
split $qr_in_delim
)[0]
)[0];
}
}
}
my $out_col_width;
if (defined $fixed_width)
{
$out_col_width = (
defined $max_width
and
$max_width < $in_max_width
)
? $max_width
: $in_max_width;
}
if (defined $var_width)
{
$out_col_width = defined $max_width
? [ map { $max_width < $_ ? $max_width : $_ } @$in_max_width ]
: $in_max_width;
}
seek $fh,0,0;
while (<$fh>)
{
print format_line(
line => $_
,qr_in_delim => $qr_in_delim
,out_col_width => $out_col_width
,(
defined $out_delim
? (out_delim => $out_delim)
: ()
)
,(
keys %out_col_format
? (out_col_format => \%out_col_format)
: ()
)
,(
defined $out_spacer
? (out_spacer => $out_spacer)
: ()
)
);
}
###
# subroutines
###
sub format_line
{
my %parms = @_;
die "No line parm was passed" unless defined $parms{line};
die "No qr_in_delim parm was passed" unless defined $parms{qr_in_delim};
# if other args were passed in
if (keys %parms > 2)
{
# split line into fields
chomp $parms{line};
my @fields = split /$parms{qr_in_delim}/, $parms{line};
# apply formatting to fields as needed
$fields[$_] = sprintf($parms{out_col_format}->{$_}, $fields[$_])
for keys %{ $parms{out_col_format} };
# set col width if specified
if (defined $parms{out_col_width})
{
if (ref $parms{out_col_width})
{
$fields[$_] = defined $fields[$_]
? sprintf
(
'%' . $parms{out_col_width}->[$_] . 's'
,(
length $fields[$_] > $parms{out_col_width}->[$_]
? unpack 'A' . $parms{out_col_width}->[$_], $fields[$_]
: $fields[$_]
)
)
: ' ' x $parms{out_col_width}->[$_]
for (0 .. $#{$parms{out_col_width}});
}
else
{
$fields[$_] = sprintf('%' . $parms{out_col_width} . 's' , $fields[$_])
for (0 .. $#fields);
}
}
# reassemble the line
$parms{line} = join
(
$parms{out_delim}
.(
defined $parms{out_spacer}
? $parms{out_spacer}
: ''
)
,@fields
)
."\n";
}
return $parms{line};
}
|
|