#!/usr/bin/perl
#
# $Id: bibkeys,v 1.22 2002/08/31 20:00:39 potse Exp $
#
# Adds cite keys to a .bib file, and removes duplicates (= same
# authors, title, and journal).
#
# This is useful with .bib files generated by Reference Manager using
# bibtex.os, which may not have useful cite keys.
#
# This program uses the Text::BibTeX Perl module
#

$usage = "\n  USAGE:  $0 infile outfile\n\n";

use Text::BibTeX;

$fi = shift  or die($usage);
$fo = shift  or die($usage);

open FO, ">$fo"  or die "cannot open output file";
$date = `date`;
$Revision = "(version";
$noisiveR = ")";
print FO "% bibkey $Revision: 1.22 $noisiveR output;  $date\n\n"
    or die "cannot open output file";

$infile = new Text::BibTeX::File "$fi";

# #
# # either use this or use write_entry below
# #
# close FO;
# $outfile = new Text::BibTeX::File ">$fo";


$totrecs = 0;
while ($entry = new Text::BibTeX::Entry $infile){
    $totrecs++;

    #
    # unpack the record and generate components for citekey
    #
    warn "error in input" unless $entry->parse_ok;
    ($author, $year) = $entry->get('author','year');
    $old_key = $entry->key;          # cannot get these with ->get
    $type    = $entry->type;  
    @authors =  $entry->names('author');
    @lparts = $authors[0]->part('last');  # last name of first author
    @vparts = $authors[0]->part('von');  # von name of first author
    $name = $lparts[0];           # first part of last name, used for key
    ($name, $rem) = split(' ', $name);    # make sure there are no spaces in it
    $name =~ s/[{}]//g;           # and remove braces
    $flname = join(' ', (@vparts, @lparts));  # print name

    #
    # remove trailing period from title field
    #
    $title = $entry->get('title');
    if($title =~ /(.*)\.$/){ $entry->set('title', $1); }

    #
    # create a unique identification of each entry
    #
    $ident = "";
    foreach $a (@authors){
	$ident .= join '', $a->part('first');
	$ident .= join '', $a->part('von');
	$ident .= join '', $a->part('last');
	$ident .= join '', $a->part('jr');
    }
    $ident .= $title . $entry->get('journal');
    $ident =~ s/\s//g;    # ignore spaces
    
    # print "$ident\n";

    if($Handled{$ident}){   # skip duplicates
	$dk = $Handled{$ident};
	print " duplicate of [$dk]\n";
	next;
    }
    
    #
    # determine the new citekey, and add bibkeys specific fields
    #
    if(length($name)>7){
	$key = lc sprintf("%.4s%d", $name, $year);
    }else{
	$key = lc sprintf("%s%d", $name, $year);
    }
    if($Keys{$key}){
	$citekey = $key . chr($Keys{$key}+96);   # make it unique
    }else{
	$citekey = $key;
    }
    $Keys{$key}++;
    # $entry->set('bibkeys_nr', $totrecs, 'bibkeys_ok', $old_key);
    $entry->set_key($citekey);           # cannot set this with ->set

    #
    # For duplicate recognition. Could store anything as a value;
    # only the key matters, but it may come in handy later.
    #
    $Handled{$ident} = $citekey;
    
    
    #
    # remove unneccessary fields
    #
    $entry->delete('note', 'keywords');        # don't need these
    $entry->delete('reftype');    # RefMan/bibtex.os debugging info

    #
    # insert/protect spaces in journal name
    #
    if($entry->exists('journal')){
	$journal = $entry->get('journal');
	$journal =~ s/\.([A-z])/\.\\ \1/g;   # not at end!
	$journal =~ s/\s+/ /g;
	$entry->set('journal', $journal);
    }
    
    #
    # tell what we did
    #
    print sprintf "%3d.  %-14s  $Keys{$key}  $type by $flname\n", $totrecs, "[$citekey]";

    #
    # pack and write
    #
    #    $entry->write($outfile);    # Text::BibTeX method
    #
    write_entry($entry);
}


############# end of main program ########################################

##
## functions
##

sub write_entry($entry) {
    
    $type = $entry->type;
    $key = $entry->key;
    print FO "\@$type" . "{$key,\n";
    
    foreach $f ($entry->fieldlist){
	write_field($f, $entry->get($f));
    }
    print FO "}\n\n";    # finish entry    
}


#
# write a single field; special handling for some fields
#
# Variables used by the formats start with $fmt_
#
sub write_field($field, $value) {
    my $field = shift;
    my $value = shift;
    
    $fmt_field = $field;   # used by format

    FO->format_name("Bibfield");    # default format
    
    if(($field =~ "month") && ($value =~ /^...$/)){
	$fmt_value = "$value,";      # no braces for simple month
	$fmt_value =~ tr/[A-Z]/[a-z]/;       # make lowercase
	write FO;
    }elsif($field =~ /(author)|(editor)/){
        #
        # special formatting for authors/editors:
	# a special format is used, employing an array (@fmt_aut)
	# that is shift'ed to force one author on each line
	#
	@authors =  $entry->split ('author');
	$tmp = join ' and;', @authors;    # re-insert 'and'
        @fmt_aut = split ';', "{$tmp},";
	$fmt_first_author = shift @fmt_aut;
	FO->format_name("Bibfield_author");
	write FO;
    }elsif($field =~ /(abstract)|(institution)/){ # special formatting for
	$fmt_value = "{$value},";             # multi-paragraph fields
	@abslines = split "\n\n", $fmt_value;
	$fmt_firstline = shift @abslines;
	FO->format_name("Bibfield_abstract_first");   # first paragraph
	write FO;
	FO->format_name("Bibfield_abstract");  # other paragraphs
	foreach $fmt_line (@abslines){
	    write FO;
	}
    }else{
	if(length($field) > 18){
	    FO->format_name("Bibfield_long");
	}
	$fmt_value = "{$value},";   # attach braces and comma
	write FO;
    }
}

#
# format definitions for the write's above:
#
format Bibfield =
  @<<<<<<<<<<<<<<<<= ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  $fmt_field,        $fmt_value
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                     $fmt_value
.

#
# for fieldnames longer than 18 characters
#    
format Bibfield_long =
  @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  $fmt_field,
                   =  ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                      $fmt_value
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                      $fmt_value
.

#
# A more compact format for abstracts and institutions;
# need two formats and a write for each paragraph to force newlines.
# Note that there's an empty line in the second format. Each `line' is in 
# fact a paragraph.
#
format Bibfield_abstract_first =
  @<<<<<<<<<<<<<<<<= ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  $fmt_field,            $fmt_firstline
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
                     $fmt_firstline
      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
      $fmt_firstline
.

format Bibfield_abstract =

      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                     $fmt_line
.

    
#
# format for authors
#
format Bibfield_author =
  @<<<<<<<<<<<<<<<<= ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  $fmt_field,        $fmt_first_author
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                     $fmt_first_author
                      ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
                     shift @fmt_aut
.

