#!/usr/bin/perl # # $Id: ovid2bib,v 1.44 2000/04/04 11:21:49 potse Exp $ # # Convert saved ovid information (ovid format) to BibTeX format # $usage = "USAGE: ovid2bib [-coa] infile.ovid [outfile.bib] OPTIONS: -c : copy ovid text as a comment to the bib file -o : write also nonstandard fields, except abstract field -a : write also the abstract, if present "; # # $Log: ovid2bib,v $ # Revision 1.44 2000/04/04 11:21:49 potse # fix # # Revision 1.43 2000/04/04 11:20:43 potse # handle articles from EMBASE too (different Source format) # # Revision 1.42 1999/09/10 08:49:23 potse # split author names on periods only, not on newlines # # Revision 1.41 1999/08/19 16:08:27 potse # ignore empty names # # Revision 1.40 1999/08/12 09:46:07 potse # fix for multiline authors field # # Revision 1.39 1999/05/14 10:57:14 potse # protect capitalization also in first word of the title, # because it may be all-caps. # # Revision 1.38 1999/05/14 10:54:33 potse # recognize source for incollections without named editor # # Revision 1.37 1999/05/14 10:13:36 potse # remove newlines from incollection title before trying to parse # # Revision 1.36 1999/05/14 09:56:51 potse # allow quotes in names # # Revision 1.35 1999/05/14 09:53:46 potse # fix for capitalization of words with hyphens and quoted words # # Revision 1.34 1999/05/14 09:46:31 potse # use incollection entry type rather than inbook # # Revision 1.33 1999/05/14 09:34:40 potse # remove trailing period and space from title # # Revision 1.32 1999/05/14 09:25:41 potse # translate "et al" in authors/editors to "others" # # Revision 1.31 1999/05/13 19:09:23 mark # fix for TeX special chars # # Revision 1.30 1999/05/13 19:00:52 mark # handle long fieldnames differently # # Revision 1.29 1999/05/13 18:37:00 mark # protect all uppercase words in the title, except the first # # Revision 1.28 1999/05/13 17:24:32 mark # fiddled with formats # # Revision 1.27 1999/05/13 16:37:01 mark # * let "keywords" be a primary field # * translate "Author e-mail Address" into "e-mail" # # Revision 1.26 1999/05/13 16:30:38 mark # handle unknown fields; use "Author Keywords" as "keywords" field # # Revision 1.25 1999/05/13 14:43:24 mark # pick pages and optional note from the title of inbook's # # Revision 1.24 1999/05/13 13:52:19 mark # * put name formatting in a function; # * format editors like authors # * put booktitle etc in primary fields # # Revision 1.23 1999/05/13 13:26:06 mark # handle "inbook" entries # # Revision 1.22 1999/05/13 12:42:03 mark # fixed author name splitting regexp # # Revision 1.21 1999/05/13 12:21:06 mark # handle institution like abstract (needed bug fix in # recognition of empty lines) # # Revision 1.20 1999/05/13 12:02:49 mark # fixed bug in abstract handling, and cleaned up # # Revision 1.19 1999/05/13 11:50:01 mark # implemented c, o, a options and reorganized a bit # # packages; use FileHandle; # for selecting formats use Getopt::Std; # command line option handling # settings: $: = " \n"; # don't break format ^ on hyphens, as is default $| = 1; # set autoflush (always fflush after print/write) # define the flags (see above in Usage) getopts('coa') or die $usage; # get filename arguments ($infile, $outfile) = @ARGV; if($#ARGV<0 || $#ARGV>1 ){ die $usage; } if(!$outfile){ $outfile = "$infile.bib"; } open INFILE, $infile or die "Can't open input file \"$infile\""; open OUTFILE, ">$outfile" or die "Can't open output file \"$outfile\""; $field = "unknown"; $count = 0; while(){ if(/^<([0-9]+)>/){ # new entry $nr = $1; $count++; if($count>1){ &output_entry; } # output previous entry %entry = (); # start new entry print "\n<$nr> "; } elsif (/^Unique Identifier/){ $field = "unique-id"; } elsif (/^Authors/) { $field = "author"; } elsif (/^Institution/) { $field = "institution"; } elsif (/^Title/) { $field = "title"; } elsif (/^Source/) { $field = "source"; } elsif (/^Abbreviated Source/){ # overrides Source $field = "source"; $entry{$field} = ""; } elsif (/^Abstract/){ $field = "abstract"; } elsif (/^Author Keywords/){ $field = "keywords"; } elsif (/^Author e-mail Address/){ $field = "e-mail"; } elsif (/^(\w.*)/){ # unknown field $field = $1; $field =~ tr/[A-Z ]/[a-z_]/; # lowercase and no spaces } elsif (/^\s*$/){ # empty line: append to field, but kill extra space $entry{$field} .= "\n"; } # order of these two is important! elsif (/^\s\s(.+)/){ # nonempty line starting with two spaces: $entry{$field} .= "$1\n"; # append to a field } else{ print "can't parse line: \"$_\""; } if($opt_c){ # copy original as comment print OUTFILE "% "; s/@/\\AT /g; # BibTeX doesn't like at-signs in comments print OUTFILE; } } &output_entry; # last entry print "\n"; close INFILE; close OUTFILE; # end of main program ######################################################################## # # Format names (authors, editors): # Put a period-space after each initial, but take # care of multiletter initials like "Th". Characters # allowed in names are: # # \w (word) # \s (space) as in "E. O. Robles de Medina" # - (hyphen) as in "F. J. Wilms-Schopman" # ' (right quote) as in "C. N. d'Alnoncourt" # # Translate "et al" into "others"; BibTeX recognizes that and # translates back in the way specified in the bibliography style. # sub format_names(@names) { my @names = @_; my @authors = (); foreach $name (@names){ if($name =~ /et al/){ push @authors, "others"; # BibTeX takes care of this }elsif($name =~ /^\s*$/){ # empty name (possibly newline char): ignore }else{ $name =~ /^\s*([\s\w-']+)\s+(\w+)$/; $last = $1; $inits = $2; $inits =~ s/([A-Z][a-z]*)/$1\. /g; push @authors, "$last, $inits"; } } return @authors; } ####################################################################### # # output one entry as a BibTeX @article or @incollection # (@incollection is a part of a book having its own title, according # to the manual [btxdoc.dvi], in contrast to @inbook). # sub output_entry { delete $entry{ignored}; # don't print the ignored fields foreach $field (keys %entry){ chop $entry{$field}; # chop final newline } # try to convert source to journal etc. if($entry{source} =~ /(.*)\.\s+(.*):(.*),\s*(\w*)(\s[\w\-]*)?(\s.*)?\./){ # # matches an article from MEDLINE # $bibtex_type = "article"; $entry{journal} = $1; $entry{volume} = $2; # volume and number! $entry{pages} = $3; $entry{year} = $4; $entry{month} = $5; $entry{date} = $6; # try to separate volume and number # the number may be absent, e.g. volume = "30 suppl." if($entry{volume} =~ /(.*)$(.*)$/){ $entry{volume} = $1; $entry{number} = $2; } }elsif($entry{source} =~ /(.*)\.\s+Vol\s+(.*)\s+$pp (.*)$,\s+(.*)\./){ # # matches an article from EMBASE # $bibtex_type = "article"; $entry{journal} = $1; $entry{volume} = $2; # volume and number! $entry{pages} = $3; $entry{year} = $4; # try to separate volume and number # the number may be absent, e.g. volume = "30 suppl." if($entry{volume} =~ /(.*)$(.*)$/){ $entry{volume} = $1; $entry{number} = $2; } }elsif($entry{source} =~ /In:\s*(.*),\sed\.\s+(.*)\.\s*(.*),\s*(.*),\s*([0-9]+)\.\s*(.*)$/){ # # matches a chapter in a book # $bibtex_type = "incollection"; $entry{editor} = $1; $entry{booktitle} = $2; $entry{address} = $3; $entry{publisher} = $4; $entry{year} = $5; $entry{locdata} = $6; }elsif($entry{source} =~ /In:\s*([^\.]*\.{1})\s*(.*),\s*(.*),\s*([0-9]+)\.\s*(.*)$/){ # # matches a chapter in a book without named editor # $bibtex_type = "incollection"; $entry{booktitle} = $1; $entry{address} = $2; $entry{publisher} = $3; $entry{year} = $4; $entry{locdata} = $5; }else{ print "Can't parse source field. "; # space at end! } # for incollections, the title field may contain pages and notes if($bibtex_type =~ /incollection/){ $entry{title} =~ tr/\n/ /; if($entry{title} =~ /(.*)pp\.\s([0-9\-]+)\.\s*(.*)?$/){ print "incollection "; $entry{title} = $1; $entry{pages} = $2; $entry{note} = $3; }else{ print "can't parse incollection title "; } }else{ print "article "; } # fix up: if(exists($entry{pages})){ $entry{pages} =~ s/-/--/g; # page ranges... } if(exists($entry{month})){ $entry{month} =~ s/\.//g; $entry{month} =~ s/-/--/g; $entry{month} =~ s/^\s*//g; # remove initial space } $date = $entry{date}; if(length($date)<1){ delete $entry{date}; } # remove empty field if(exists($entry{date})){ $entry{date} =~ s/^\s*//g; # remove initial space } if(exists($entry{title})){ $entry{title} =~ s/\.\s*$//; # remove trailing period and space } if(exists($entry{booktitle})){ $entry{booktitle} =~ s/\.\s*$//; # ditto } # # Format authors: Separate the names with # the string " and;", the semicolon is removed later. # In the ovid files, names are separated by periods. # if(exists($entry{author})){ @names = split '\.', $entry{author}; # split on periods @authors = format_names(@names); $entry{author} = join " and;", (@authors); } # the same for editors if(exists($entry{editor})){ @names = split ',', $entry{editor}; # split on commas! @editors = format_names(@names); $entry{editor} = join " and;", (@editors); } # # construct label from first author's last name and year # $tmp = $authors[0]; if($tmp =~ /(.*),/){ if(exists($entry{year})){ $label = "$1$entry{year}"; }else{ print "Missing year. "; $label = "$1:"; } $label =~ tr/[A-Z]/[a-z]/; # make lowercase $label =~ s/\s+//g; # remove spaces like in "de Bakker" }else{ print "Can't make label. "; $label = $entry{unique-id}; } $Labelcount{$label}++; # remember used labels if($Labelcount{$label}>1){ # already in use $label .= chr(95+$Labelcount{$label}); # a, b, ... } print "($label) "; # # Postprocessing of all fields: # * handle TeX's special characters foreach $field (keys %entry){ $entry{$field} =~ s/&/\\&{}/g; $entry{$field} =~ s/%/\\,\\%{}/g; $entry{$field} =~ s/\$/\\\${}/g; $entry{$field} =~ s/\^/\\\^{}/g; $entry{$field} =~ s/#/\\#{}/g; } # Protect capitalized and all-caps words in title, # even the first word (may be all-caps...). We assume that ovid files # have sound capitalization. # $entry{title} =~ s/\b([\w\-]*[A-Z][\w\-]*\b)/\{$1\}/g; print OUTFILE "\@$bibtex_type" . "{$label,\n"; # start writing the entry # write the fields: known fields first, then see what's # left in another foreach loop. # foreach $field ('author','title','journal','booktitle', 'editor','publisher','address','year', 'volume','number','month','date','pages', 'keywords'){ if(exists $entry{$field}){ write_field($field); delete $entry{$field}; } } if(exists $entry{abstract}){ $field = "abstract"; # must set $field! write_field($field) if $opt_a; # only if -a option given delete $entry{$field}; } if($opt_o){ # only if -o option given foreach $field (sort keys %entry){ write_field($field); } } print OUTFILE "}\n\n"; # finish entry } # # write a single field; special handling for some fields # sub write_field($field) { my $field = shift; OUTFILE->format_name("Bibfield"); # default format if(($field =~ "month") && ($entry{$field} =~ /^...$/)){ $value = "$entry{$field},"; # no braces for simple month $value =~ tr/[A-Z]/[a-z]/; # make lowercase write OUTFILE; }elsif($field =~ /(author)|(editor)/){ # # special formatting for authors/editors: # a special format is used, employing an array (@aut) # that is shift'ed to force one author on each line # $tmp = "{$entry{$field}},"; # attach braces and comma @aut = split ';', $tmp; $first_author = shift @aut; OUTFILE->format_name("Bibfield_author"); write OUTFILE; }elsif($field =~ /(abstract)|(institution)/){ # special formatting for $value = "{$entry{$field}},"; # multi-paragraph fields @abslines = split "\n\n", $value; $first_line = shift @abslines; OUTFILE->format_name("Bibfield_abstract_first"); # first paragraph write OUTFILE; OUTFILE->format_name("Bibfield_abstract"); # other paragraphs foreach $line (@abslines){ write OUTFILE; } }else{ if(length($field) > 18){ OUTFILE->format_name("Bibfield_long"); } $value = "{$entry{$field}},"; # attach braces and comma write OUTFILE; } } # # format definitions for the write's above: # format Bibfield = @<<<<<<<<<<<<<<<<= ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< $field, $value ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ $value . # # for fieldnames longer than 18 characters # format Bibfield_long = @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< $field, = ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ $value ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ $value . # # A more compact format for abstracts and institutions; # need two formats and a write for each paragraph to force newlines. # Note that there's an empty line in the second format. Each `line' is in # fact a paragraph. # format Bibfield_abstract_first = @<<<<<<<<<<<<<<<<= ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< $field, $first_line ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~ $first_line ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ $first_line . format Bibfield_abstract = ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ $line . # # format for authors # format Bibfield_author = @<<<<<<<<<<<<<<<<= ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< $field, $first_author ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ $first_author ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~ shift @aut .