#!/usr/bin/perl
#
# $Id: ovid2bib,v 1.44 2000/04/04 11:21:49 potse Exp $
#
# Convert saved ovid information (ovid format) to BibTeX format
#
$usage = "USAGE: ovid2bib [-coa] infile.ovid [outfile.bib]
OPTIONS:
  -c : copy ovid text as a comment to the bib file
  -o : write also nonstandard fields, except abstract field
  -a : write also the abstract, if present
";

#
# $Log: ovid2bib,v $
# Revision 1.44  2000/04/04 11:21:49  potse
# fix
#
# Revision 1.43  2000/04/04 11:20:43  potse
# handle articles from EMBASE too (different Source format)
#
# Revision 1.42  1999/09/10 08:49:23  potse
# split author names on periods only, not on newlines
#
# Revision 1.41  1999/08/19 16:08:27  potse
# ignore empty names
#
# Revision 1.40  1999/08/12 09:46:07  potse
# fix for multiline authors field
#
# Revision 1.39  1999/05/14 10:57:14  potse
# protect capitalization also in first word of the title,
# because it may be all-caps.
#
# Revision 1.38  1999/05/14 10:54:33  potse
# recognize source for incollections without named editor
#
# Revision 1.37  1999/05/14 10:13:36  potse
# remove newlines from incollection title before trying to parse
#
# Revision 1.36  1999/05/14 09:56:51  potse
# allow quotes in names
#
# Revision 1.35  1999/05/14 09:53:46  potse
# fix for capitalization of words with hyphens and quoted words
#
# Revision 1.34  1999/05/14 09:46:31  potse
# use incollection entry type rather than inbook
#
# Revision 1.33  1999/05/14 09:34:40  potse
# remove trailing period and space from title
#
# Revision 1.32  1999/05/14 09:25:41  potse
# translate "et al" in authors/editors to "others"
#
# Revision 1.31  1999/05/13 19:09:23  mark
# fix for TeX special chars
#
# Revision 1.30  1999/05/13 19:00:52  mark
# handle long fieldnames differently
#
# Revision 1.29  1999/05/13 18:37:00  mark
# protect all uppercase words in the title, except the first
#
# Revision 1.28  1999/05/13 17:24:32  mark
# fiddled with formats
#
# Revision 1.27  1999/05/13 16:37:01  mark
#  * let "keywords" be a primary field
#  * translate "Author e-mail Address" into "e-mail"
#
# Revision 1.26  1999/05/13 16:30:38  mark
# handle unknown fields; use "Author Keywords" as "keywords" field
#
# Revision 1.25  1999/05/13 14:43:24  mark
# pick pages and optional note from the title of inbook's
#
# Revision 1.24  1999/05/13 13:52:19  mark
#  * put name formatting in a function;
#  * format editors like authors
#  * put booktitle etc in primary fields
#
# Revision 1.23  1999/05/13 13:26:06  mark
# handle "inbook" entries
#
# Revision 1.22  1999/05/13 12:42:03  mark
# fixed author name splitting regexp
#
# Revision 1.21  1999/05/13 12:21:06  mark
# handle institution like abstract (needed bug fix in
# recognition of empty lines)
#
# Revision 1.20  1999/05/13 12:02:49  mark
# fixed bug in abstract handling, and cleaned up
#
# Revision 1.19  1999/05/13 11:50:01  mark
# implemented c, o, a options and reorganized a bit
#

# packages;
use FileHandle;   # for selecting formats
use Getopt::Std;      # command line option handling

# settings:
$: = " \n";    # don't break format ^ on hyphens, as is default
$| = 1;      # set autoflush (always fflush after print/write)

# define the flags (see above in Usage)
getopts('coa') or die $usage;

# get filename arguments
($infile, $outfile) = @ARGV;
if($#ARGV<0 || $#ARGV>1 ){
    die $usage;
}
if(!$outfile){
    $outfile = "$infile.bib";
}

open INFILE, $infile or die "Can't open input file \"$infile\"";
open OUTFILE, ">$outfile"  or die "Can't open output file \"$outfile\"";

$field = "unknown";
$count = 0;

while(<INFILE>){
    if(/^<([0-9]+)>/){   # new entry
	$nr = $1;
	$count++;
	if($count>1){ &output_entry; } # output previous entry
	%entry = ();      	# start new entry
	print "\n<$nr> ";
    }
    elsif (/^Unique Identifier/){ $field = "unique-id"; }
    elsif (/^Authors/)          { $field = "author"; }
    elsif (/^Institution/)      { $field = "institution"; }
    elsif (/^Title/)            { $field = "title"; }
    elsif (/^Source/)           { $field = "source"; }
    elsif (/^Abbreviated Source/){    # overrides Source
	$field = "source";
	$entry{$field} = "";
    }
    elsif (/^Abstract/){              $field = "abstract"; }
    elsif (/^Author Keywords/){       $field = "keywords"; }
    elsif (/^Author e-mail Address/){ $field = "e-mail"; }
    elsif (/^(\w.*)/){     # unknown field
	$field = $1;
	$field =~ tr/[A-Z ]/[a-z_]/;  # lowercase and no spaces
    }
    elsif (/^\s*$/){  # empty line: append to field, but kill extra space
	$entry{$field} .= "\n";
    }                      # order of these two is important!
    elsif (/^\s\s(.+)/){   # nonempty line starting with two spaces:
	$entry{$field} .= "$1\n";     # append to a field
    }
    else{ print "can't parse line: \"$_\""; }
    
    if($opt_c){       # copy original as comment
	print OUTFILE "% ";
	s/@/\\AT /g;     # BibTeX doesn't like at-signs in comments
	print OUTFILE;
    }
}
&output_entry;      # last entry

print "\n";
close INFILE;
close OUTFILE;

# end of main program

########################################################################

#
# Format names (authors, editors):
# Put a period-space after each initial, but take
# care of multiletter initials like "Th". Characters
# allowed in names are:
#
#    \w (word)
#    \s (space)       as in "E. O. Robles de Medina"
#    -  (hyphen)      as in "F. J. Wilms-Schopman"
#    '  (right quote) as in "C. N. d'Alnoncourt"
#
# Translate "et al" into "others"; BibTeX recognizes that and
# translates back in the way specified in the bibliography style.
#
sub format_names(@names) {
    my @names = @_;
    my @authors = ();
    foreach $name (@names){
	if($name =~ /et al/){
	    push @authors, "others";   # BibTeX takes care of this
	}elsif($name =~ /^\s*$/){
	    # empty name (possibly newline char): ignore
	}else{
	    $name =~ /^\s*([\s\w-']+)\s+(\w+)$/;
	    $last = $1;
	    $inits = $2;
	    $inits =~ s/([A-Z][a-z]*)/$1\. /g;
	    push @authors, "$last, $inits";
	}
    }
    return @authors;
}

#######################################################################

#
# output one entry as a BibTeX @article or @incollection
# (@incollection is a part of a book having its own title, according
# to the manual [btxdoc.dvi], in contrast to @inbook).
#
sub output_entry {
    delete $entry{ignored};   # don't print the ignored fields

    foreach $field (keys %entry){ 
	chop $entry{$field};      # chop final newline
    }
    
    # try to convert source to journal etc.
    if($entry{source} =~ /(.*)\.\s+(.*):(.*),\s*(\w*)(\s[\w\-]*)?(\s.*)?\./){
	#
        # matches an article from MEDLINE
	#
	$bibtex_type = "article";
	$entry{journal} = $1;
	$entry{volume} = $2;   # volume and number!
	$entry{pages} = $3;
	$entry{year} = $4;
	$entry{month} = $5;
	$entry{date} = $6;
	# try to separate volume and number
	# the number may be absent, e.g. volume = "30 suppl."
	if($entry{volume} =~ /(.*)\((.*)\)/){
	    $entry{volume} = $1;
	    $entry{number} = $2;
	}
    }elsif($entry{source} =~ /(.*)\.\s+Vol\s+(.*)\s+\(pp (.*)\),\s+(.*)\./){
	#
        # matches an article from EMBASE
	#
	$bibtex_type = "article";
	$entry{journal} = $1;
	$entry{volume} = $2;   # volume and number!
	$entry{pages} = $3;
	$entry{year} = $4;
	# try to separate volume and number
	# the number may be absent, e.g. volume = "30 suppl."
	if($entry{volume} =~ /(.*)\((.*)\)/){
	    $entry{volume} = $1;
	    $entry{number} = $2;
	}
    }elsif($entry{source} =~
	   /In:\s*(.*),\sed\.\s+(.*)\.\s*(.*),\s*(.*),\s*([0-9]+)\.\s*(.*)$/){
	#
        # matches a chapter in a book
	#
	$bibtex_type = "incollection";
	$entry{editor} = $1;
	$entry{booktitle} = $2;
	$entry{address} = $3;
	$entry{publisher} = $4;
	$entry{year} = $5;
	$entry{locdata} = $6;
    }elsif($entry{source} =~
	   /In:\s*([^\.]*\.{1})\s*(.*),\s*(.*),\s*([0-9]+)\.\s*(.*)$/){
	#
        # matches a chapter in a book without named editor
	#
	$bibtex_type = "incollection";
	$entry{booktitle} = $1;
	$entry{address} = $2;
	$entry{publisher} = $3;
	$entry{year} = $4;
	$entry{locdata} = $5;
    }else{
	print "Can't parse source field. ";  # space at end!
    }

    # for incollections, the title field may contain pages and notes
    if($bibtex_type =~ /incollection/){
	$entry{title} =~ tr/\n/ /;
	if($entry{title} =~ /(.*)pp\.\s([0-9\-]+)\.\s*(.*)?$/){
	    print "incollection ";
	    $entry{title} = $1;
	    $entry{pages} = $2;
	    $entry{note} = $3;
	}else{
	    print "can't parse incollection title ";
	}
    }else{
	print "article ";
    }
    
    # fix up:
    if(exists($entry{pages})){
	$entry{pages} =~ s/-/--/g;          # page ranges...
    }
    if(exists($entry{month})){
	$entry{month} =~ s/\.//g;
	$entry{month} =~ s/-/--/g;
	$entry{month} =~ s/^\s*//g;   # remove initial space
    }
    $date = $entry{date};
    if(length($date)<1){ delete $entry{date}; }   # remove empty field
    if(exists($entry{date})){
	$entry{date} =~ s/^\s*//g;   # remove initial space
    }
    if(exists($entry{title})){
	$entry{title} =~ s/\.\s*$//;   # remove trailing period and space
    }
    if(exists($entry{booktitle})){
	$entry{booktitle} =~ s/\.\s*$//;   # ditto
    }

    #
    # Format authors:  Separate the names with
    # the string " and;", the semicolon is removed later.
    # In the ovid files, names are separated by periods.
    #
    if(exists($entry{author})){
	@names = split '\.', $entry{author};   # split on periods
	@authors = format_names(@names);
	$entry{author} = join " and;", (@authors);
    }
    
    # the same for editors
    if(exists($entry{editor})){
	@names = split ',', $entry{editor};   # split on commas!
	@editors = format_names(@names);
	$entry{editor} = join " and;", (@editors);
    }
    
    
    #
    # construct label from first author's last name and year
    #
    $tmp = $authors[0];
    if($tmp =~ /(.*),/){
	if(exists($entry{year})){
	    $label = "$1$entry{year}";
	}else{
	    print "Missing year. ";
	    $label = "$1:";
	}
	$label =~ tr/[A-Z]/[a-z]/;       # make lowercase
	$label =~ s/\s+//g;         # remove spaces like in "de Bakker"
    }else{
	print "Can't make label. ";
	$label = $entry{unique-id};
    }
    $Labelcount{$label}++;           # remember used labels
    if($Labelcount{$label}>1){            # already in use
	$label .= chr(95+$Labelcount{$label});     # a, b, ...
    }
    print "($label) ";

    #
    # Postprocessing of all fields:
    #    * handle TeX's special characters
    foreach $field (keys %entry){ 
	$entry{$field} =~ s/&/\\&{}/g;
	$entry{$field} =~ s/%/\\,\\%{}/g; 
	$entry{$field} =~ s/\$/\\\${}/g;
	$entry{$field} =~ s/\^/\\\^{}/g;
	$entry{$field} =~ s/#/\\#{}/g;
    }

    # Protect capitalized and all-caps words in title,
    # even the first word (may be all-caps...). We assume that ovid files
    # have sound capitalization.
    #
    $entry{title} =~ s/\b([\w\-]*[A-Z][\w\-]*\b)/\{$1\}/g;
    
    print OUTFILE "\@$bibtex_type" . "{$label,\n";    # start writing the entry

    # write the fields: known fields first, then see what's
    # left in another foreach loop.
    #
    foreach $field ('author','title','journal','booktitle',
		    'editor','publisher','address','year',
		    'volume','number','month','date','pages',
		    'keywords'){
	if(exists $entry{$field}){
	    write_field($field);
	    delete $entry{$field};
	}
    }
    if(exists $entry{abstract}){
	$field = "abstract";     # must set $field!
	write_field($field) if $opt_a;  # only if -a option given
	delete $entry{$field};
    }
    if($opt_o){                         # only if -o option given
	foreach $field (sort keys %entry){
	    write_field($field);
	}
    }

    print OUTFILE "}\n\n";    # finish entry
}

#
# write a single field; special handling for some fields
#
sub write_field($field) {
    my $field = shift;

    OUTFILE->format_name("Bibfield");    # default format
    
    if(($field =~ "month") && ($entry{$field} =~ /^...$/)){
	$value = "$entry{$field},";      # no braces for simple month
	$value =~ tr/[A-Z]/[a-z]/;       # make lowercase
	write OUTFILE;
    }elsif($field =~ /(author)|(editor)/){
        #
        # special formatting for authors/editors:
	# a special format is used, employing an array (@aut)
	# that is shift'ed to force one author on each line
	#
	$tmp = "{$entry{$field}},";      # attach braces and comma
	@aut = split ';', $tmp;        
	$first_author = shift @aut;
	OUTFILE->format_name("Bibfield_author");
	write OUTFILE;
    }elsif($field =~ /(abstract)|(institution)/){ # special formatting for
	$value = "{$entry{$field}},";             # multi-paragraph fields
	@abslines = split "\n\n", $value;
	$first_line = shift @abslines;
	OUTFILE->format_name("Bibfield_abstract_first");   # first paragraph
	write OUTFILE;
	OUTFILE->format_name("Bibfield_abstract");  # other paragraphs
	foreach $line (@abslines){
	    write OUTFILE;
	}
    }else{
	if(length($field) > 18){
	    OUTFILE->format_name("Bibfield_long");
	}
	$value = "{$entry{$field}},";   # attach braces and comma
	write OUTFILE;
    }
}

#
# format definitions for the write's above:
#
format Bibfield =
  @<<<<<<<<<<<<<<<<= ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  $field,            $value
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                     $value
.

#
# for fieldnames longer than 18 characters
#    
format Bibfield_long =
  @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  $field,
                   =  ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                      $value
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                      $value
.

#
# A more compact format for abstracts and institutions;
# need two formats and a write for each paragraph to force newlines.
# Note that there's an empty line in the second format. Each `line' is in 
# fact a paragraph.
#
format Bibfield_abstract_first =
  @<<<<<<<<<<<<<<<<= ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  $field,            $first_line
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
                     $first_line
      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
      $first_line
.

format Bibfield_abstract =

      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                     $line
.

    
#
# format for authors
#
format Bibfield_author =
  @<<<<<<<<<<<<<<<<= ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  $field,            $first_author
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                     $first_author
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                     shift @aut
.

