Listing 4 CSV2db::Update
package CSV2db::Update;
use strict;
use vars qw($VERSION);
use Class::Struct;
use IO::File;
use Text::CSV_XS;
use DBI;
$VERSION = substr q$Revision: 1.0 $, 10;
# attributes, with their default values (if any)
my %default =
(configfile => '',
dbhost => 'localhost',
dbname => undef,
dbid => undef,
dbpw => undef,
dbdname => 'mysql',
csvquotechar => '"',
csvfieldsep => ',',
verbose => 0,
fake => 0,
stdinpw => 0,
alarmt => 10,
_conf => undef,
);
# Use Class::Struct's struct() to create our standard accessor methods
# as well as the "new" method. (Note - this loop depends on all
# attributes being scalars.)
struct(map { ($_, '$') } keys %default);
# Regular expression to recognize numeric SQL types. These are types
# that should have NULL stored (represented as undef in DBI) instead
# of an empty string.
my($sqlnumberre) = '^(tinyint|smallint|mediumint|integer|int|bigint|' .
'float|double|real|decimal|numeric|datetime|date|timestamp|time|year)';
# Suffixes for renaming database tables
my($tempsuffix) = '_temp';
my($oldsuffix) = '_old';
# workhorse method
sub table {
my ($self, $table, $fh) = @_;
###
### Check arguments and assign defaults to any items still undefined
###
die("Require 'tablename' and 'filehandle' arguments to \"table\"")
unless (defined $table && defined $fh);
die("Second arg to \"table\" method must be an IO::File filehandle")
unless (ref $fh eq 'IO::File');
for (keys %default) {
$self->$_($default{$_}) unless defined $self->$_;
}
###
### Read the configuration file
###
$self->_readconfig();
###
### Assign options and pre-/post-process calls from the config file
###
# for convenience, grab them directly into a hash
my(%options) = %{$self->_conf->{options}}
if exists $self->_conf->{options};
# propagate the options
for (keys %options) {
print "Configuration file option '$_': $options{$_}\n"
if $self->verbose();
if ($self->can($_)) {
$self->$_($options{$_});
} else {
print "Bad configuration file option '$_' ignored\n";
}
}
# get and check lists of the pre- and post-process routine calls
my @preprocess = @{$self->_conf->{preprocess}}
if exists $self->_conf->{preprocess};
my @postprocess = @{$self->_conf->{postprocess}}
if exists $self->_conf->{postprocess};
for (@preprocess, @postprocess) {
die("Option file '" . $self->configfile() . "' preprocess and " .
"postprocess entries must\n\t" .
"all be code references (either 'sub {...},' or '\&function,')\n")
unless ref $_ eq 'CODE';
}
###
### Now, set up the table definitions from the config file
###
# Data structure from config file:
# tablename => [[fieldname, fieldtype, index, fulltextindex, sub]...]
# 0 1 2 3 4
# ignore field if fieldtype is empty;
# create index if index is true;
# use field in fulltextindex if fulltextindex is true;
# field #4 is _optional_ subroutine reference to run on this field.
# Note: to define primary key, do it as part of the fieldtype;
# use index only for regular indices.
# for convenience, grab the structure into a hash
my(%tables) = %{$self->_conf->{tables}};
die("Found no table definitions in the configuration file '" .
$self->configfile() . "'!\n")
unless (scalar keys %tables > 0);
# check for legal field names, force types to uppercase, check
# that field-specific subs are coderefs, and see if there are any
# "SET"-type definitions in the selected table
my $anysets = 0;
for my $thistable (keys %tables) {
for (@{$tables{$thistable}}) {
die("No field name can be longer than 64 characters!\n\t[$_]\n")
if length $_->[0] > 64;
die("Field names cannot contain '.' or '/'!\n\t[$_]\n")
if $_->[0] =~ m#\.|/#;
die("Field-specific processing routines must be coderefs\n" .
"(there's a '" . (ref $_->[4]) . "' instead)\n")
if (($_->[4]) && (ref $_->[4] ne 'CODE'));
$_->[1] = uc $_->[1]; # uppercase the type
++$anysets if (($_->[1] =~ /^SET$/i) && ($thistable eq $table));
}
}
# is our table name present in the configuration file?
die("No table '$table' is defined in config file '",
$self->configfile(), "'\n")
unless exists $tables{$table};
###
### Get our database pw from stdin, if needed, then hook us up
###
if ($self->stdinpw()) {
my $s;
eval {
local $SIG{ALRM} = sub {die "alarm\n" };
alarm $self->alarmt() if ($self->alarmt() > 0);
$s = <STDIN>;
alarm 0 if ($self->alarmt() > 0);
};
die($@ eq "alarm\n" ? "Timed out" : "Unknown cause") if ($@);
chomp $s;
$self->dbpw($s);
}
# hook up to the database
my $dbstring = "DBI" . ":" .
$self->dbdname() . ":" .
$self->dbname() . ":" .
$self->dbhost();
print "connect to '$dbstring'\n" if $self->verbose();
my $dbh;
die("Failed database connect.\n")
unless $dbh = DBI->connect("$dbstring", $self->dbid(), $self->dbpw(),
{AutoCommit => 1, RaiseError => 1});
###
### Perform preprocessing on the entire input data stream (if requested)
###
# For each preprocessing routine in order, we create a tmpfile,
# preprocess into it, then adopt that as our current data source,
# thus walking us through a series of tmpfiles.
# Each preprocess routine expects source and target IO filehandles.
for (@preprocess) {
my $temp = IO::File->new_tmpfile()
or die("Failed opening tempfile for preprocessing: $!\n");
$_->($fh, $temp);
$fh->close()
or die("Failed closing a preprocessed filehandle: $!\n");
$fh = $temp;
$fh->seek(0,0);
}
###
### Search for any MySQL SET-type values within the entire
### datafile. To get the list of items for each field, we
### exhaustively examine the entire input stream (rewinding it at
### the end). Then the SET definition(s) are rewritten to include
### their values.
###
if ($anysets) {
my(%setbits);
my @fields;
# examine all SET-type fields for their values
while ($self->_nextcsvline($fh, scalar @{$tables{$table}}, \@fields)) {
next if (scalar @fields != scalar @{$tables{$table}});
for my $tableconf (@{$tables{$table}}) {
my($field) = shift @fields;
# look only at fields with SET types from configuration file
if ($tableconf->[1] =~ /^SET$/i) {
# field-specific function for this field
$field = $tableconf->[4]->($field)
if exists $tableconf->[4];
# generic postprocess functions
for (@postprocess) {
$field = $_->($field);
}
# memorize each of the set content items
for (split(/,/, $field)) {
++$setbits{$tableconf->[0]}{$_};
}
}
}
}
$fh->seek(0,0); # rewind the input file
# rewrite field definition clauses to include the set items
for (@{$tables{$table}}) {
if ($_->[1] =~ /^SET$/i) {
$_->[1] = 'SET(';
$_->[1] .= "'" .
join("','", sort keys %{$setbits{$_->[0]}}) . "'"
if (scalar keys %{$setbits{$_->[0]}});
$_->[1] .= ')';
print "Rewrote set definition to $_->[1]\n"
if $self->verbose();
}
}
}
###
### Set up our SQL statements to create fields and indices
###
# create a name for the temporary table we will create
my $temptable = $table . $tempsuffix;
my $sql = "CREATE TABLE `$temptable` ( ";
# first, the field names and their types
for (@{$tables{$table}}) {
# if $_->[1] isn't defined, we ignore the incoming datafield
$sql .= "`$_->[0]` $_->[1], " if $_->[1];
}
$sql =~ s/,\s*$//; # kill trailing comma from list
# accumulate list of indexes to create
for (@{$tables{$table}}) {
$sql .= ", INDEX ( `$_->[0]` )" if $_->[2];
}
# create fulltext indexing, if requested
my($fullindexes) = '';
for (@{$tables{$table}}) {
if ($_->[3]) {
if ($_->[1] =~ /text|varchar/i) {
$fullindexes .= "`$_->[0]`, ";
} else {
print "'$_->[0]': no fulltext index (only text or varchar)\n";
}
}
}
$fullindexes =~ s/,\s*$//; # kill trailing comma from list
if ($fullindexes) {
$sql .= ", FULLTEXT `fulltext` ($fullindexes)";
}
# cap off the clause
$sql .= " )";
###
### Create the new table, and prepare to populate it
###
if (grep(/^$temptable$/, $dbh->tables())) {
print "do: DROP TABLE `$temptable`\n" if $self->verbose();
$dbh->do("DROP TABLE `$temptable`") unless $self->fake();
} else {
print "no temptable drop - it did not exist in the database\n";
}
print "do: $sql\n" if $self->verbose();
$dbh->do($sql) unless ($self->fake());
print "finished create\n" if $self->verbose();
$sql = "INSERT INTO `$temptable` ( ";
for (@{$tables{$table}}) {
$sql .= "`$_->[0]`, " if $_->[1];
}
$sql =~ s/,\s*$//; # trim trailing comma
$sql .= " ) VALUES ( ";
for (@{$tables{$table}}) {
$sql .= "?, " if $_->[1];
}
$sql =~ s/,\s*$//; # trim trailing comma
$sql .= " )";
print "prepare: $sql\n" if $self->verbose();
my $sth = $dbh->prepare($sql) unless ($self->fake());
###
### Step through each input CSV line, populating the new database.
### Do the field-specific and postprocessing functions along the way.
###
my @fields;
while ($self->_nextcsvline($fh, scalar @{$tables{$table}}, \@fields)) {
if (scalar @fields != scalar @{$tables{$table}}) {
# this was checked during parsing already, but what the hell
print "This record should have ", scalar @{$tables{$table}},
"but has ", scalar @fields,
". Skipped...\n\t", join("|", @fields), "\n";
next;
}
my(@out);
for (@{$tables{$table}}) {
my($field) = shift @fields;
# use only fields with non-null types from configuration file
if ($_->[1]) {
# field-specific function for this field
$field = $_->[4]->($field) if (exists $_->[4]);
# generic postprocess functions
for (@postprocess) {
$field = $_->($field);
}
# in DBI-ville, null numbers are set with undef, not ""
$field = undef
if ($field eq "" && $_->[1] =~ /$sqlnumberre/io);
push(@out, $field);
}
}
print scalar @out, " " if $self->verbose();
$sth->execute(@out) unless $self->fake();
}
print "\n" if $self->verbose();
$sth->finish unless $self->fake();
###
### We're done, so drop the old backup table, back up the existing
### table, and swap in the new one. Then quit the db connection.
###
if (grep(/^$table$oldsuffix$/, $dbh->tables())) {
print "do: DROP TABLE `$table$oldsuffix`\n" if $self->verbose();
$dbh->do("DROP TABLE `$table$oldsuffix`") unless $self->fake();
} else {
print "no backup table drop - it did not exist in the database\n";
}
if (grep(/^$table$/, $dbh->tables())) {
print "do: RENAME TABLE `$table` TO `$table$oldsuffix`\n"
if $self->verbose();
$dbh->do("RENAME TABLE `$table` TO `$table$oldsuffix`")
unless $self->fake();
} else {
print "no backup table created - it did not exist in the database\n";
}
print "do: RENAME TABLE `$temptable` TO `$table`\n" if $self->verbose();
$dbh->do("RENAME TABLE `$temptable` TO `$table`") unless $self->fake();
$dbh->disconnect;
print "Done\n" if $self->verbose();
1;
}
###
### Read the configuration file
###
sub _readconfig {
my($self) = shift @_;
# see if it really exists
die("Missing config file '", $self->configfile(), "'!\n")
if ( ! -e $self->configfile());
# make sure it's a real file
die("Config file '", $self->configfile(), "' must be regular file.\n")
unless (-f $self->configfile());
# mild permission checks
my($mode, $uid) = (stat($self->configfile()))[2,4];
die("Could not get file status on '", $self->configfile(), "': $!\n")
if ( ! defined $mode || ! defined $uid);
die("Config file must not be writable by world (only by owner/group)\n")
if ($mode & 002);
# read and parse the sucker
$self->_conf(do $self->configfile());
unless ($self->_conf) {
die("Failed parsing '", $self->configfile(), "': $@\n")
if $@;
die("Failed running '", $self->configfile(), "': $!\n")
unless defined $self->_conf;
die("Failure in configuration file '", $self->configfile(), "'\n");
}
}
###
### Return a lexically sorted list of the names of the tables
### from the object (i.e. that are in the config file).
###
sub tablenames {
my($self) = shift @_;
$self->_readconfig;
sort keys %{$self->_conf->{tables}};
}
###
### Suck data from the filehandle and render it into a single
### CSV-interpreted set of fields returned in @{$fieldsep}.
###
sub _nextcsvline {
my($self, $fh, $nfields, $fieldsp) = @_;
# start up a csv object, ready for parsing incoming text
my($csv) = Text::CSV_XS->new({binary => 1,
sep_char => $self->csvfieldsep(),
quote_char => $self->csvquotechar(),
});
# Here's the deal: some formats (notably tab-separated from
# Endnote) cheerily let linebreak characters exist within fields,
# as well as using them as record separators. Hence we read
# lines, gluing the first field of the subsequent line onto the
# last field of the preceeding line (if the field was split),
# until we get a whole record-full of fields. If, at that point,
# there are missing fields or spare fields left over, we assume we
# have an out-of-sync error and we're screwed. Along the way, we
# also deal with the possibility that records are simply split
# between lines (avoiding gluing fields in that case and just
# continuing to accumulate fields).
#
# But it gets worse. In the case where there is no quote
# character (thank you Endnote tab-separated) there is no possible
# way to distinguish a split field from a split record. So in
# that case, we always assume that a split record was split in the
# middle of a field and glue things together. Best we can do, I'm
# afraid.
#
# Sigh.
my(@fields) = ();
my($brokenfield) = 0;
return 0 if $fh->eof();
my $line = <$fh>;
unless ($csv->parse($line)) {
# if field was split, appending a quote will rescue us
print "line " . $fh->input_line_number .
": Rereading on possible split field error\n" if $self->verbose();
$csv->parse($line . $self->csvquotechar());
$brokenfield = 1;
}
if ($csv->status()) {
@fields = $csv->fields();
my @more = ();
while ((scalar @fields < $nfields || $brokenfield) &&
($line = <$fh>)) {
if ($brokenfield || $self->csvquotechar() eq '') {
# we're in the midst of a "\n"-broken single field
print "line " . $fh->input_line_number .
": Broken field - reading another line\n"
if $self->verbose();
if ($csv->parse($self->csvquotechar() . $line)) {
$brokenfield = 0; # field split once, and we cured it
} else {
# field split twice, we'll need to keep gluing
$csv->parse($self->csvquotechar() .
$line . $self->csvquotechar());
}
if ($csv->status()) {
@more = $csv->fields();
print "Got ", scalar @more, " more fields\n"
if $self->verbose();
$fields[$#fields] .= "\n" . shift @more;
} else {
die("line " . $fh->input_line_number .
": Failed reading broken field: $line\n");
}
} else {
# simply too few fields, try adding from next line
print "line " . $fh->input_line_number .
": Split record - reading another line ",
"(have ", scalar @fields, " of $nfields)\n"
if $self->verbose();
unless ($csv->parse($line)) {
# if field was split, appending a quote will rescue us
print "line " . $fh->input_line_number .
": Rerereading on a possible split field\n"
if $self->verbose();
$csv->parse($line . $self->csvquotechar());
$brokenfield = 1;
}
if ($csv->status()) {
@more = $csv->fields();
print "Got ", scalar @more, " more fields\n"
if $self->verbose();
} else {
die("line " . $fh->input_line_number .
": Failed trying to add fields to record: $line\n");
}
}
while (scalar @fields < $nfields && scalar @more > 0) {
push(@fields, shift @more);
}
}
# by now we should have a perfect record with no dangly bits
if (scalar @more > 0 || # unused bonus fields
scalar @fields != $nfields || # too few fields at the end
$brokenfield) { # unterminated very last field
die("Error on line " , $fh->input_line_number,
": Out-of-sync data. ",
scalar @more, " extra fields; have ",
scalar @fields, " of $nfields; ",
$brokenfield ? "broken field" : "not a broken field.\n",
"It happened around data line: ",
defined $line ? $line : "[last line of the file, I think]",
"\n");
}
} else {
die("Error on line " . $fh->input_line_number . ":\n\t" .
$csv->error_input . "\n");
}
@{$fieldsp} = @fields;
return 1;
}
1;
__END__
=pod
Following is a sample CGI program that can act as a file upload wrapper:
#!/usr/bin/perl -w
use strict;
use lib qw(/home/dean2/programs/CSV2db);
use CSV2db::Update;
use CSV2db::Util;
use CGI;
use CGI::Carp qw(fatalsToBrowser warningsToBrowser);
# start the proper protocol for outputting a web page
my $q = new CGI;
print $q->header(), $q->start_html(-title => 'Glossary Update',
-bgcolor => 'white',
);
# initialize our Update object and get the list of tables
my $update = CSV2db::Update->new(
configfile => 'glossary.config',
dbhost => 'localhost',
dbname => 'glossary',
dbid => 'glossarist',
dbpw => 'Murray',
dbdname => 'mysql',
verbose => 1,
fake => 0,
stdinpw => 0,
);
my @tables = $update->tablenames();
#
# either present the file-upload page or process uploaded files
#
if ( ! $q->param('button')) {
# no files uploaded yet, this is the first (print-form) call
print "<center>",
$q->start_multipart_form('POST', $ENV{SCRIPT_NAME} ?
$ENV{SCRIPT_NAME} : $q->url()),
"<table cellspacing=1 cellpadding=1 border=1>",
"<tr><td colspan='2'>",
"<center>Prepare comma-separated-value (CSV) files. ",
"<br>Enter one or more CSV file names into the appropriate ",
"blanks in the <b>File Name</b> column below, <br>then click ",
"the <b>Upload</b> button.</center>",
"</td></tr>",
"<tr><th>Table Name</th>",
"<th>File Name</th></tr>";
for (@tables) {
print "<tr><td align='right'>$_</td><td>",
$q->filefield(-name => 'file_' . $_,
-size => 40,
),
"</td></tr>\n";
}
print "<tr><td colspan=2 align='center'>",
$q->submit(-name => 'button',
-value => 'Upload CSV File(s)',
),
"</td></tr></table>", $q->endform(), "</center>";
} else {
# files have (allegedly) been uploaded, so let's process them
print "<pre>\n";
for my $table (@tables) {
my $fh = $q->upload('file_' . $table);
if (defined $fh) {
print "Processing table $table...<br>";
# this file copy is terribly lame, but it works until I
# can figure out how to convert an Fh object from
# CGI::upload into an IO::File handle.
my $iofh;
if ( ! ($iofh = IO::File->new_tmpfile())) {
print "Failed opening tempfile: $!\n";
} else {
$iofh->print($_) while (<$fh>);
$iofh->seek(0,0);
$update->table($table, $iofh);
}
} else {
print "Skipping table $table...\n";
}
}
print "Done</pre>\n";
}
print $q->end_html();
=cut
|