Listing 4 CSV2db::Update

package CSV2db::Update;

use strict;
use vars qw($VERSION);
use Class::Struct;
use IO::File;
use Text::CSV_XS;
use DBI;

$VERSION = substr q$Revision: 1.0 $, 10;

# attributes, with their default values (if any)
my %default =
    (configfile          => '',
     dbhost         => 'localhost',
     dbname         => undef,
     dbid         => undef,
     dbpw         => undef,
     dbdname             => 'mysql',
     csvquotechar     => '"',
     csvfieldsep     => ',',
     verbose         => 0,
     fake         => 0,
     stdinpw         => 0,
     alarmt         => 10,
     _conf               => undef,
    );

# Use Class::Struct's struct() to create our standard accessor methods
# as well as the "new" method.  (Note - this loop depends on all
# attributes being scalars.)
struct(map { ($_, '$') } keys %default);

# Regular expression to recognize numeric SQL types.  These are types
# that should have NULL stored (represented as undef in DBI) instead
# of an empty string.
my($sqlnumberre) = '^(tinyint|smallint|mediumint|integer|int|bigint|' .
    'float|double|real|decimal|numeric|datetime|date|timestamp|time|year)';

# Suffixes for renaming database tables
my($tempsuffix) = '_temp';
my($oldsuffix)  = '_old';

# workhorse method
sub table {
    my ($self, $table, $fh) = @_;

    ###
    ### Check arguments and assign defaults to any items still undefined
    ###
    die("Require 'tablename' and 'filehandle' arguments to \"table\"")
    unless (defined $table  &&  defined $fh);
    die("Second arg to \"table\" method must be an IO::File filehandle")
    unless (ref $fh eq 'IO::File');
    for (keys %default) {
    $self->$_($default{$_}) unless defined $self->$_;
    }

    ###
    ### Read the configuration file
    ###
    $self->_readconfig();

    ###
    ### Assign options and pre-/post-process calls from the config file
    ###
    # for convenience, grab them directly into a hash
    my(%options) = %{$self->_conf->{options}}
    if exists $self->_conf->{options};
    # propagate the options
    for (keys %options) {
    print "Configuration file option '$_': $options{$_}\n"
        if $self->verbose();
    if ($self->can($_)) {
        $self->$_($options{$_});
    } else {
        print "Bad configuration file option '$_' ignored\n";
    }
    }
    # get and check lists of the pre- and post-process routine calls
    my @preprocess = @{$self->_conf->{preprocess}}
    if exists $self->_conf->{preprocess};
    my @postprocess = @{$self->_conf->{postprocess}}
    if exists $self->_conf->{postprocess};
    for (@preprocess, @postprocess) {
    die("Option file '" . $self->configfile() . "' preprocess and " .
        "postprocess entries must\n\t" .
        "all be code references (either 'sub {...},' or '\&function,')\n")
        unless ref $_ eq 'CODE';
    }

    ###
    ### Now, set up the table definitions from the config file
    ###
    # Data structure from config file:
    #   tablename => [[fieldname, fieldtype, index, fulltextindex, sub]...]
    #                      0          1        2          3         4
    #   ignore field if fieldtype is empty;
    #   create index if index is true;
    #   use field in fulltextindex if fulltextindex is true;
    #   field #4 is _optional_ subroutine reference to run on this field.
    #   Note: to define primary key, do it as part of the fieldtype;
    #   use index only for regular indices.
    # for convenience, grab the structure into a hash
    my(%tables) = %{$self->_conf->{tables}};
    die("Found no table definitions in the configuration file '" .
    $self->configfile() . "'!\n")
    unless (scalar keys %tables > 0);
    # check for legal field names, force types to uppercase, check
    # that field-specific subs are coderefs, and see if there are any
    # "SET"-type definitions in the selected table
    my $anysets = 0;
    for my $thistable (keys %tables) {
    for (@{$tables{$thistable}}) {
        die("No field name can be longer than 64 characters!\n\t[$_]\n")
        if length $_->[0] > 64;
        die("Field names cannot contain '.' or '/'!\n\t[$_]\n")
        if $_->[0] =~ m#\.|/#;
        die("Field-specific processing routines must be coderefs\n" .
            "(there's a '" . (ref $_->[4]) . "' instead)\n")
        if (($_->[4])  &&  (ref $_->[4] ne 'CODE'));
        $_->[1] = uc $_->[1];  # uppercase the type
        ++$anysets if (($_->[1] =~ /^SET$/i)  &&  ($thistable eq $table));
    }
    }
    # is our table name present in the configuration file?
    die("No table '$table' is defined in config file '",
    $self->configfile(), "'\n")
    unless exists $tables{$table};

    ###
    ### Get our database pw from stdin, if needed, then hook us up
    ###
    if ($self->stdinpw()) {
    my $s;
    eval {
        local $SIG{ALRM} = sub {die "alarm\n" };
        alarm $self->alarmt() if ($self->alarmt() > 0);
        $s = <STDIN>;
        alarm 0 if ($self->alarmt() > 0);
    };
    die($@ eq "alarm\n"  ?  "Timed out"  :  "Unknown cause") if ($@);
    chomp $s;
    $self->dbpw($s);
    }
    # hook up to the database
    my $dbstring = "DBI"  . ":" .
    $self->dbdname()  . ":" .
    $self->dbname()   . ":" .
    $self->dbhost();
    print "connect to '$dbstring'\n" if $self->verbose();
    my $dbh;
    die("Failed database connect.\n")
    unless $dbh = DBI->connect("$dbstring", $self->dbid(), $self->dbpw(),
                   {AutoCommit => 1, RaiseError => 1});

    ###
    ### Perform preprocessing on the entire input data stream (if requested)
    ###
    # For each preprocessing routine in order, we create a tmpfile,
    # preprocess into it, then adopt that as our current data source,
    # thus walking us through a series of tmpfiles.
    # Each preprocess routine expects source and target IO filehandles.
    for (@preprocess) {
    my $temp = IO::File->new_tmpfile()
        or die("Failed opening tempfile for preprocessing: $!\n");
    $_->($fh, $temp);
    $fh->close()
        or die("Failed closing a preprocessed filehandle: $!\n");
    $fh = $temp;
    $fh->seek(0,0);
    }

    ###
    ### Search for any MySQL SET-type values within the entire
    ### datafile.  To get the list of items for each field, we
    ### exhaustively examine the entire input stream (rewinding it at
    ### the end).  Then the SET definition(s) are rewritten to include
    ### their values.
    ###
    if ($anysets) {
    my(%setbits);
    my @fields;
    # examine all SET-type fields for their values
    while ($self->_nextcsvline($fh, scalar @{$tables{$table}}, \@fields)) {
        next if (scalar @fields  !=  scalar @{$tables{$table}});
        for my $tableconf (@{$tables{$table}}) {
        my($field) = shift @fields;
        # look only at fields with SET types from configuration file
        if ($tableconf->[1] =~ /^SET$/i) {
            # field-specific function for this field
            $field = $tableconf->[4]->($field)
            if exists $tableconf->[4];
            # generic postprocess functions
            for (@postprocess) {
            $field = $_->($field);
            }
            # memorize each of the set content items
            for (split(/,/, $field)) {
            ++$setbits{$tableconf->[0]}{$_};
            }
        }
        }
    }
    $fh->seek(0,0);   # rewind the input file
    # rewrite field definition clauses to include the set items
    for (@{$tables{$table}}) {
        if ($_->[1] =~ /^SET$/i) {
        $_->[1] = 'SET(';
        $_->[1] .= "'" .
            join("','", sort keys %{$setbits{$_->[0]}}) . "'"
            if (scalar keys %{$setbits{$_->[0]}});
        $_->[1] .= ')';
        print "Rewrote set definition to $_->[1]\n"
            if $self->verbose();
        }
    }
    }

    ###
    ### Set up our SQL statements to create fields and indices
    ###
    # create a name for the temporary table we will create
    my $temptable = $table . $tempsuffix;
    my $sql = "CREATE TABLE `$temptable` ( ";
    # first, the field names and their types
    for (@{$tables{$table}}) {
    # if $_->[1] isn't defined, we ignore the incoming datafield
    $sql .= "`$_->[0]` $_->[1], " if $_->[1];
    }
    $sql =~ s/,\s*$//; # kill trailing comma from list
    # accumulate list of indexes to create
    for (@{$tables{$table}}) {
    $sql .= ", INDEX ( `$_->[0]` )" if $_->[2];
    }
    # create fulltext indexing, if requested
    my($fullindexes) = '';
    for (@{$tables{$table}}) {
    if ($_->[3]) {
        if ($_->[1] =~ /text|varchar/i) {
        $fullindexes .= "`$_->[0]`, ";
        } else {
        print "'$_->[0]': no fulltext index (only text or varchar)\n";
        }
    }
    }
    $fullindexes =~ s/,\s*$//; # kill trailing comma from list
    if ($fullindexes) {
    $sql .= ", FULLTEXT `fulltext` ($fullindexes)";
    }
    # cap off the clause
    $sql .= " )";

    ###
    ### Create the new table, and prepare to populate it
    ###
    if (grep(/^$temptable$/, $dbh->tables())) {
    print "do: DROP TABLE `$temptable`\n" if $self->verbose();
    $dbh->do("DROP TABLE `$temptable`") unless $self->fake();
    } else {
    print "no temptable drop - it did not exist in the database\n";
    }
    print "do: $sql\n" if $self->verbose();
    $dbh->do($sql) unless ($self->fake());
    print "finished create\n" if $self->verbose();
    $sql = "INSERT INTO `$temptable` ( ";
    for (@{$tables{$table}}) {
    $sql .= "`$_->[0]`, " if $_->[1];
    }
    $sql =~ s/,\s*$//;  # trim trailing comma
    $sql .= " ) VALUES ( ";
    for (@{$tables{$table}}) {
    $sql .= "?, "  if $_->[1];
    }
    $sql =~ s/,\s*$//;  # trim trailing comma
    $sql .= " )";
    print "prepare: $sql\n" if $self->verbose();
    my $sth = $dbh->prepare($sql) unless ($self->fake());

    ###
    ### Step through each input CSV line, populating the new database.
    ### Do the field-specific and postprocessing functions along the way.
    ###
    my @fields;
    while ($self->_nextcsvline($fh, scalar @{$tables{$table}}, \@fields)) {
    if (scalar @fields  !=  scalar @{$tables{$table}}) {
        # this was checked during parsing already, but what the hell
        print "This record should have ", scalar @{$tables{$table}},
        "but has ", scalar @fields,
        ". Skipped...\n\t", join("|", @fields), "\n";
        next;
    }
    my(@out);
    for (@{$tables{$table}}) {
        my($field) = shift @fields;
        # use only fields with non-null types from configuration file
        if ($_->[1]) {
        # field-specific function for this field
        $field = $_->[4]->($field) if (exists $_->[4]);
        # generic postprocess functions
        for (@postprocess) {
            $field = $_->($field);
        }
        # in DBI-ville, null numbers are set with undef, not ""
        $field = undef
            if ($field eq ""  &&  $_->[1] =~ /$sqlnumberre/io);
        push(@out, $field);
        }
    }
    print scalar @out, " " if $self->verbose();
    $sth->execute(@out) unless $self->fake();
    }
    print "\n" if $self->verbose();
    $sth->finish unless $self->fake();

    ###
    ### We're done, so drop the old backup table, back up the existing
    ### table, and swap in the new one.  Then quit the db connection.
    ###
    if (grep(/^$table$oldsuffix$/, $dbh->tables())) {
    print "do: DROP TABLE `$table$oldsuffix`\n" if $self->verbose();
    $dbh->do("DROP TABLE `$table$oldsuffix`") unless $self->fake();
    } else {
    print "no backup table drop - it did not exist in the database\n";
    }
    if (grep(/^$table$/, $dbh->tables())) {
    print "do: RENAME TABLE `$table` TO `$table$oldsuffix`\n"
        if $self->verbose();
    $dbh->do("RENAME TABLE `$table` TO `$table$oldsuffix`")
        unless $self->fake();
    } else {
    print "no backup table created - it did not exist in the database\n";
    }
    print "do: RENAME TABLE `$temptable` TO `$table`\n" if $self->verbose();
    $dbh->do("RENAME TABLE `$temptable` TO `$table`") unless $self->fake();
    $dbh->disconnect;

    print "Done\n" if $self->verbose();

    1;
}


###
### Read the configuration file
###
sub _readconfig {
    my($self) = shift @_;

    # see if it really exists
    die("Missing config file '", $self->configfile(), "'!\n")
    if ( ! -e $self->configfile());
    # make sure it's a real file
    die("Config file '", $self->configfile(), "' must be regular file.\n")
    unless (-f $self->configfile());
    # mild permission checks
    my($mode, $uid) = (stat($self->configfile()))[2,4];
    die("Could not get file status on '", $self->configfile(), "': $!\n")
    if ( ! defined $mode  ||  ! defined $uid);
    die("Config file must not be writable by world (only by owner/group)\n")
    if ($mode & 002);
    # read and parse the sucker
    $self->_conf(do $self->configfile());
    unless ($self->_conf) {
    die("Failed parsing '", $self->configfile(), "': $@\n")
        if $@;
    die("Failed running '", $self->configfile(), "': $!\n")
        unless defined $self->_conf;
    die("Failure in configuration file '", $self->configfile(), "'\n");
    }
}

###
### Return a lexically sorted list of the names of the tables
### from the object (i.e. that are in the config file).
###
sub tablenames {
    my($self) = shift @_;
    $self->_readconfig;
    sort keys %{$self->_conf->{tables}};
}

###
### Suck data from the filehandle and render it into a single
### CSV-interpreted set of fields returned in @{$fieldsep}.
###
sub _nextcsvline {
    my($self, $fh, $nfields, $fieldsp) = @_;

    # start up a csv object, ready for parsing incoming text
    my($csv) = Text::CSV_XS->new({binary     => 1,
                  sep_char   => $self->csvfieldsep(),
                  quote_char => $self->csvquotechar(),
                 });

    # Here's the deal: some formats (notably tab-separated from
    # Endnote) cheerily let linebreak characters exist within fields,
    # as well as using them as record separators.  Hence we read
    # lines, gluing the first field of the subsequent line onto the
    # last field of the preceeding line (if the field was split),
    # until we get a whole record-full of fields.  If, at that point,
    # there are missing fields or spare fields left over, we assume we
    # have an out-of-sync error and we're screwed.  Along the way, we
    # also deal with the possibility that records are simply split
    # between lines (avoiding gluing fields in that case and just
    # continuing to accumulate fields).
    #
    # But it gets worse.  In the case where there is no quote
    # character (thank you Endnote tab-separated) there is no possible
    # way to distinguish a split field from a split record.  So in
    # that case, we always assume that a split record was split in the
    # middle of a field and glue things together.  Best we can do, I'm
    # afraid.
    #
    # Sigh.

    my(@fields) = ();
    my($brokenfield) = 0;

    return 0 if $fh->eof();
    my $line = <$fh>;
    unless ($csv->parse($line)) {
    # if field was split, appending a quote will rescue us
    print "line " . $fh->input_line_number .
        ": Rereading on possible split field error\n" if $self->verbose();
    $csv->parse($line . $self->csvquotechar());
    $brokenfield = 1;
    }
    if ($csv->status()) {
    @fields = $csv->fields();
    my @more = ();
    while ((scalar @fields < $nfields  ||  $brokenfield)  &&
           ($line = <$fh>)) {
        if ($brokenfield  ||  $self->csvquotechar() eq '') {
        # we're in the midst of a "\n"-broken single field
        print "line " . $fh->input_line_number .
            ": Broken field - reading another line\n"
            if $self->verbose();
        if ($csv->parse($self->csvquotechar() . $line)) {
            $brokenfield = 0; # field split once, and we cured it
        } else {
            # field split twice, we'll need to keep gluing
            $csv->parse($self->csvquotechar() .
                $line . $self->csvquotechar());
        }
        if ($csv->status()) {
            @more = $csv->fields();
            print "Got ", scalar @more, " more fields\n"
            if $self->verbose();
            $fields[$#fields] .= "\n" . shift @more;
        } else {
            die("line " . $fh->input_line_number .
            ": Failed reading broken field: $line\n");
        }
        } else {
        # simply too few fields, try adding from next line
        print "line " . $fh->input_line_number .
            ": Split record - reading another line ",
            "(have ", scalar @fields, " of $nfields)\n"
            if $self->verbose();
        unless ($csv->parse($line)) {
            # if field was split, appending a quote will rescue us
            print "line " . $fh->input_line_number .
            ": Rerereading on a possible split field\n"
                if $self->verbose();
            $csv->parse($line . $self->csvquotechar());
            $brokenfield = 1;
        }
        if ($csv->status()) {
            @more = $csv->fields();
            print "Got ", scalar @more, " more fields\n"
            if $self->verbose();
        } else {
            die("line " . $fh->input_line_number .
            ": Failed trying to add fields to record: $line\n");
        }
        }
        while (scalar @fields < $nfields  &&  scalar @more > 0) {
        push(@fields, shift @more);
        }
    }
    # by now we should have a perfect record with no dangly bits
    if (scalar @more  >  0          || # unused bonus fields
        scalar @fields != $nfields  || # too few fields at the end
        $brokenfield) {    # unterminated very last field
        die("Error on line " , $fh->input_line_number,
        ": Out-of-sync data.  ",
        scalar @more, " extra fields; have ",
        scalar @fields, " of $nfields; ",
        $brokenfield ? "broken field" : "not a broken field.\n",
        "It happened around data line: ",
        defined $line ? $line : "[last line of the file, I think]",
        "\n");
    }
    } else {
    die("Error on line " . $fh->input_line_number . ":\n\t" .
        $csv->error_input . "\n");
    }
    @{$fieldsp} = @fields;
    return 1;
}


1;

__END__

=pod
Following is a sample CGI program that can act as a file upload wrapper:

 #!/usr/bin/perl -w
 use strict;
 use lib qw(/home/dean2/programs/CSV2db);
 use CSV2db::Update;
 use CSV2db::Util;
 use CGI;
 use CGI::Carp qw(fatalsToBrowser warningsToBrowser);

 # start the proper protocol for outputting a web page
 my $q = new CGI;
 print $q->header(), $q->start_html(-title   => 'Glossary Update',
                    -bgcolor => 'white',
                   );

 # initialize our Update object and get the list of tables
 my $update = CSV2db::Update->new(
                  configfile => 'glossary.config',
                  dbhost     => 'localhost',
                  dbname     => 'glossary',
                  dbid       => 'glossarist',
                  dbpw       => 'Murray',
                  dbdname    => 'mysql',
                  verbose    => 1,
                  fake        => 0,
                  stdinpw    => 0,
                 );
 my @tables = $update->tablenames();

 #
 # either present the file-upload page or process uploaded files
 #
 if ( ! $q->param('button')) {
     # no files uploaded yet, this is the first (print-form) call
     print "<center>",
     $q->start_multipart_form('POST', $ENV{SCRIPT_NAME} ?
                  $ENV{SCRIPT_NAME} : $q->url()),
         "<table cellspacing=1 cellpadding=1 border=1>",
     "<tr><td colspan='2'>",
     "<center>Prepare comma-separated-value (CSV) files. ",
     "<br>Enter one or more CSV file names into the appropriate ",
     "blanks in the <b>File Name</b> column below, <br>then click ",
     "the <b>Upload</b> button.</center>",
     "</td></tr>",
     "<tr><th>Table Name</th>",
     "<th>File Name</th></tr>";
     for (@tables) {
     print "<tr><td align='right'>$_</td><td>",
         $q->filefield(-name => 'file_' . $_,
               -size => 40,
              ),
         "</td></tr>\n";
     }
     print "<tr><td colspan=2 align='center'>",
     $q->submit(-name => 'button',
            -value => 'Upload CSV File(s)',
           ),
     "</td></tr></table>", $q->endform(), "</center>";

 } else {

     # files have (allegedly) been uploaded, so let's process them
     print "<pre>\n";
     for my $table (@tables) {
     my $fh = $q->upload('file_' . $table);
     if (defined $fh) {
         print "Processing table $table...<br>";
         # this file copy is terribly lame, but it works until I
         # can figure out how to convert an Fh object from
         # CGI::upload into an IO::File handle.
         my $iofh;
         if ( ! ($iofh = IO::File->new_tmpfile())) {
         print "Failed opening tempfile: $!\n";
         } else {
         $iofh->print($_) while (<$fh>);
         $iofh->seek(0,0);
         $update->table($table, $iofh);
         }
     } else {
         print "Skipping table $table...\n";
     }
     }
     print "Done</pre>\n";
 }

 print $q->end_html();

=cut