Phil Lawrence on 23 Jan 2004 17:13:46 -0000


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

Re: seek with (<>) ?


Mark Dominus wrote:
> [lotso good stuff]

Thanks for the details.  I especially found the code sample and this
explanation *very* helpful:

> Here if the input is already a file, we use the file it's in.  This
> handles all three cases:
>
>         pivot_dump | col_width     # data is copied to /tmp/cw12345
>         col_width foo              # data is not copied
>         col_width < foo            # data is not copied
>
> The second case has $input_file defined, and the third case has
> $input_file undefined but -f STDIN true.

I did end up going with a tempfile solution, though by using the above
explanation I see I can smarten it up a bit...

I think this will be the end goal:
  1. if rewindable, just do that
  2. else buffer up to a certain memory usage (default or
    passed in value), and then switch to tempfile solution

#2 will take some research, I don't yet know if I can/how to check
memory consumption of my @slurp variable on the fly.

Anyway, attached for the list's enjoyment is my script for formatting
delimited data.  Works pretty nice.

prl




#!/usr/local/bin/perl
use warnings;
use diagnostics;
use strict;

use Getopt::Long;
use File::Temp qw/ tempfile /;

###
# parm defaults and validation
###
my $in_delim       = "\t";
my $out_delim;
my $fixed_width;
my $var_width;
my %out_col_format = ();
my $out_spacer;
GetOptions (
	"in_delimiter|delimiter|d=s"            => \$in_delim,

	"out_delimiter|od=s"                    => \$out_delim,
	"fixed_width|fw:i"                      => \$fixed_width,      # standalone flag, or can take a max width
	"variable_width|var_width|vw:i"         => \$var_width,        # standalone flag, or can take a max width

	"out_column_format|out_cf|ocf=s"        => \%out_col_format,
	"out_spacer|out_s|os=s"                 => \$out_spacer,
);

die "Specified both --fixed_width and --variable_width"
  if
    defined $fixed_width
    and
    defined $var_width;

die "--out_column_format keys may be integers only"
  if grep /\D/, keys %out_col_format;

my $qr_in_delim = qr/$in_delim/;

###
# main logic
###

if (!defined $fixed_width and !defined $var_width)
{
	while (<>)
	{
		print format_line(
		        line => $_
		       ,qr_in_delim => $qr_in_delim
		       ,(
		          defined $out_delim
		            ? (out_delim => $out_delim)
		            : ()
		        )
		       ,(
		          keys %out_col_format
		            ? (out_col_format => \%out_col_format)
		            : ()
		        )
		       ,(
		          defined $out_spacer
		            ? (out_spacer => $out_spacer)
		            : ()
		        )
		      );
	}
	exit;
}

# Create tempfile which will be auto-deleted when program exits
my $fh = tempfile(UNLINK => 1);

my ($max_width, $in_max_width);
if (defined $fixed_width)
{
	if ($fixed_width > 0)
	{
		$max_width = $fixed_width;
	}
	else
	{
		$in_max_width = 0;
	}
}
if (defined  $var_width)
{
	if ($var_width > 0)
	{
		$max_width = $var_width;
	}
	$in_max_width = [];
}
		
for (<>)
{
	print $fh $_;

	# if we need to collect max width info...
	if (defined $in_max_width)
	{
		chomp;
		if (ref $in_max_width) # we're looking for the widths of the widest field for *each* column
		{
			my @in = map
			         {
			           my $idx = -1;
			           if (defined $out_col_format{++$idx})
			           {
			             $_ = sprintf $out_col_format{$idx}, $_;
			           }
			           length;
			         } split $qr_in_delim;
			for (0 .. $#in)
			{
				$in_max_width->[$_] = $in[$_]
				  if !defined $in_max_width->[$_]
				     or
				     $in[$_] > $in_max_width->[$_];
			}
		}
		else                   # we're looking for max width of all the fields
		{
			$in_max_width = (
			              sort { $b <=> $a }
			                   $in_max_width
			                  ,(
			                     sort { $b <=> $a }
			                          map { length }
			                              split $qr_in_delim
			                   )[0]
			            )[0];
		}
	}
}

my $out_col_width;
if (defined $fixed_width)
{
	$out_col_width = (
	                   defined $max_width
	                   and
	                   $max_width < $in_max_width
	                 )
	               ? $max_width
	               : $in_max_width;
}

if (defined $var_width)
{
	$out_col_width = defined $max_width
	               ? [ map { $max_width < $_ ? $max_width : $_ } @$in_max_width ]
	               : $in_max_width;
}

seek $fh,0,0;

while (<$fh>)
{
	print format_line(
	        line => $_
	       ,qr_in_delim => $qr_in_delim
	       ,out_col_width => $out_col_width
	       ,(
	          defined $out_delim
	            ? (out_delim => $out_delim)
	            : ()
	        )
	       ,(
	          keys %out_col_format
	            ? (out_col_format => \%out_col_format)
	            : ()
	        )
	       ,(
	          defined $out_spacer
	            ? (out_spacer => $out_spacer)
	            : ()
	        )
	      );
}



###
# subroutines
###
sub format_line
{
	my %parms = @_;

	die "No line parm was passed" unless defined $parms{line};
	die "No qr_in_delim parm was passed" unless defined $parms{qr_in_delim};

	# if other args were passed in
	if (keys %parms > 2)
	{
		# split line into fields
		chomp $parms{line};
		my @fields = split /$parms{qr_in_delim}/, $parms{line};

		# apply formatting to fields as needed
		$fields[$_] = sprintf($parms{out_col_format}->{$_}, $fields[$_])
		  for keys %{ $parms{out_col_format} };

		# set col width if specified
		if (defined $parms{out_col_width})
		{
			if (ref $parms{out_col_width})
			{
				$fields[$_] = defined $fields[$_]
				            ? sprintf
				              (
				                '%' . $parms{out_col_width}->[$_] . 's'
				               ,(
				                  length $fields[$_] > $parms{out_col_width}->[$_]
				                    ? unpack 'A' . $parms{out_col_width}->[$_], $fields[$_]
				                    : $fields[$_]
				                )
				              )
				            : ' ' x $parms{out_col_width}->[$_]
				  for (0 .. $#{$parms{out_col_width}});
			}
			else
			{
				$fields[$_] = sprintf('%' . $parms{out_col_width} . 's' , $fields[$_])
				  for (0 .. $#fields);
			}
		}

		# reassemble the line
		$parms{line} = join
		               (
		                 $parms{out_delim}
		                .(
		                   defined $parms{out_spacer}
		                   ? $parms{out_spacer}
		                   : ''
		                 )
		                ,@fields
		               )
		              ."\n";
	}

	return $parms{line};
}