#/bin/zsh

# mrcheckbib - version DBL_MIN
# checks a bibtex file (.bib) against the AMS MRef database
#
# Copyright (C) 2006 John Cagnol <john@cagnol.net>
# University Leonard de Vinci, Paris, France
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.  This document can
# be obtained at http://www.gnu.org/licenses/gpl.txt
#
# Usage:
#   mrcheckbib basename
# 
# Where:
#   basename.bib is the bibtex file to be checked
#
# Output:
#   result is placed in basename-mrefed.bib and log is placed in
#   basename-mrefed.log
#
# Example:
#   mrcheckbib references
#
#   checks file references.bib against MRef and places the result in
#   references-mrefed.bib  
#
# Purpose of the script:
#
# This script takes every entry in a bibtex file (.bib) and checks it
# against the AMS MRef database (see http://www.ams.org/mref).  If it
# finds a unique MRef entry, then your entry will be replaced by the
# MRef one. For example, if your bibtex entry is
# 
#   @article{greens,
#   Author="Cagnol, J. and Lebiedzik, C.",
#   Title="On the free boundary conditions for a dynamic shell model
#   based on intrinsic differential geometry",
#   Journal="Applicable Analysis", Year=2004}
#
# It will be replaced by
#
#   @article {greens,
#       AUTHOR = {Cagnol, John and Lebiedzik, Catherine},
#        TITLE = {On the free boundary conditions for a dynamic shell
#                 model based on intrinsic differential geometry},
#      JOURNAL = {Appl. Anal.},
#     FJOURNAL = {Applicable Analysis. An International Journal},
#       VOLUME = {83},
#         YEAR = {2004},
#       NUMBER = {6},
#        PAGES = {607--633},
#         ISSN = {0003-6811},
#      MRCLASS = {74K25 (35B35 53B50 74H99)},
#     MRNUMBER = {MR2059476 (2006c:74066)},
#   MRREVIEWER = {Liliana Gratie},
#   }
#
# The latter is more complete than the former (page numbers, etc.).
# Please note that mrcheckbib does not try to figure out which entry
# is more complete.  If a unique MRef entry is found, then your entry
# is gone.  In most cases, this is good because the MRef entry is more
# complete. 
#
# mrcheckbib is intended to be used with a standard bibfile, with no
# comment line.  It is assumed that your shell is zsh and that you have
# a non-interactive network retriever such as wget.  If you use curl
# or any other retriever, you'll need to adapt the options.
#
# The script relies on the fact that the MRef response is bracketed by
# <pre> tags.  If AMS changes this, then the script will stop working.
#
# Please note that this script was written in haste, it was not
# intended to be portable, robust or well-written.  Version number is
# DBL_MIN, which is pretty low!  It was released because several people 
# asked me to, but it has not been tested extensively, and may require 
# fine tuning.  If you come up with a better version, please let me
# know. 


# Name of the non-interactive network retriever.  The download step passes
# wget-specific options, so switching to another tool (curl, ...) means
# adapting those options as well.
wget="wget"

# Files, all derived from the basename given as first argument.
inputfile="${1}.bib"            # bibliography to check
outputfile="${1}-mrefed.bib"    # checked bibliography
outputlog="${1}-mrefed.log"     # log of the run
tempfile="${1}.$$"              # scratch file, made unique with the shell PID

# Statistics: entries seen, entries replaced by MRef data, entries left
# as they were because no unique match was found.
refcount=0
okcount=0
numcount=0

# Comment lines written in front of each entry of the output file.
date="$(date +"%d %b %Y")"
verified="% checked against MRef on ${date}"
unverified="% Could not find a unique match on MRef on ${date}"
weird="% reference verification failed on ${date}"

# Preconditions.  Every failure below is reported on stderr and ends the
# script with a non-zero status so that callers can detect the abort.

# A basename is required; without it the file names would degenerate to
# ".bib", "-mrefed.bib", ...
if test -z "$1"
then
  echo "Usage: $0 basename" >&2
  exit 2
fi

# The bibliography must be readable, otherwise only a header-only output
# file would be produced.
if ! test -r "$inputfile"
then
  echo "$0: cannot read file $inputfile.  Aborting" >&2
  exit 1
fi

# Output files better not exist: results are appended, never overwritten.
for existing in "$outputfile" "$outputlog"
do
  if test -e "$existing"
  then
    echo "$0: file $existing exists.  Aborting" >&2
    exit 1
  fi
done

# Filter: reads a BibTeX stream on stdin and writes it to stdout with one
# entry per line.  Newlines, tabulations and carriage returns (CRLF files)
# are removed, every @ starts a new line (and is dropped), and spaces are
# replaced by @ so that an entry never contains whitespace.
mr_flatten_bib() {
  tr -d '\n\t\r' | tr '@' '\n' | sed 's/ /@/g'
}

# bib entries are placed one by line in the scratch file
mr_flatten_bib < "$inputfile" > "$tempfile"

# Header of the output file
{
  echo "% Generated by mrcheckbib version DBL_MIN on $date"
  echo "% For more information, please visit http://www.cagnol.com/mrcheckbib"
  echo " "
  echo " "
} >> "$outputfile"

# Processing of each entry
#
# $tempfile holds one flattened entry per line (spaces encoded as @).  Each
# entry is sent to MRef.  When the answer contains exactly one <pre> ...
# </pre> block, the entry is replaced by the MRef data (keeping the user's
# citation key); otherwise the original entry is written back on a single
# line, preceded by a comment explaining why.

# Print the line numbers (one per line) of the lines of file $2 matching
# pattern $1.  Prints nothing when no line matches.
mr_match_lines() {
  grep -n -- "$1" "$2" | cut -d: -f1
}

# Classify an MRef answer from the line numbers of its <pre> ($1) and
# </pre> ($2) tags, as printed by mr_match_lines.  Prints:
#   0  exactly one <pre>, one </pre>, and </pre> comes after <pre>
#   1  no <pre> at all, or several of them: not a unique match
#   2  <pre> found but </pre> missing, repeated or misplaced
mr_classify() {
  case "$1" in
    '' | *[!0-9]*) echo 1; return 0 ;;   # empty, or more than one number
  esac
  case "$2" in
    '' | *[!0-9]*) echo 2; return 0 ;;
  esac
  if [ "$2" -gt "$1" ]
  then
    echo 0
  else
    echo 2
  fi
}

# Print the flattened entry $1 the way it is written back to the output
# file: leading @ restored and the @ placeholders turned back into spaces.
# printf is used because some echo implementations interpret the
# backslashes of LaTeX commands.
mr_emit_original() {
  printf '@'
  printf '%s\n' "$1" | sed 's/@/ /g'
}

# The entries are read on descriptor 3 so that the commands of the loop
# body keep the script's standard input.  Reading line by line (instead of
# word-splitting the whole file) keeps glob characters in entries intact.
while IFS= read -r entry <&3 || [ -n "$entry" ]
do
  # Empty lines (typically the text before the first @) are not entries
  [ -n "$entry" ] || continue

  # Entry info
  refcount=$((refcount + 1))
  reference=$(printf '%s\n' "$entry" | sed 's/@/%20/g')
  bibtype=$(printf '%s\n' "$reference" | cut -d'{' -f1 | sed 's/%20//g')
  bibtag=$(printf '%s\n' "$reference" | cut -d'{' -f2 | cut -d, -f1 | sed 's/%20//g')
  printf '%s\n' "Checking reference $refcount ($bibtag, $bibtype)"
  printf '%s\n' "[$refcount, $bibtag, $bibtype]" >> "$outputlog"

  # fetch the MRef for the entry
  printf 'connecting. '
  URL="http://www.ams.org/mathscinet-mref?&dataType=bibtex&ref=$reference"
  page="$tempfile.$refcount"
  # if you use curl, adapt the options below
  if ! "${wget:-wget}" -a "$outputlog" -O "$page" "$URL"
  then
    # Not fatal: the answer (if any) is still examined below, but the
    # failure is recorded so that the log explains a missing match.
    echo "download failed" >> "$outputlog"
  fi

  # figure out the start and the end of the entry (delimited by <pre> tags)
  start=$(mr_match_lines "<pre>" "$page")
  end=$(mr_match_lines "</pre>" "$page")
  echo "start=$start, end=$end" >> "$outputlog"

  result=$(mr_classify "$start" "$end")

  case "$result" in
  1) # No single <pre> tag, most likely because no unique match could be found
     echo "Not a unique match."
     echo "Not a unique match." >> "$outputlog"
     numcount=$((numcount + 1))
     printf '%s\n' "$unverified" >> "$outputfile"
     mr_emit_original "$entry" >> "$outputfile"
     ;;

  2) # A <pre> tag could be found, but no usable </pre> tag, that's bizarre.
     echo "Internal error."
     echo "Internal error." >> "$outputlog"
     printf '%s\n' "$weird" >> "$outputfile"
     mr_emit_original "$entry" >> "$outputfile"
     ;;

  0) # Tags <pre> and </pre> were found, MRef returned a result!
     echo "found. "
     echo "OK." >> "$outputlog"
     okcount=$((okcount + 1))

     # find where to cut: the data are the lines strictly between the tags
     length=$((end - start - 1))
     endm1=$((end - 1))

     # get the entrytype from MRef (it sits on the <pre> line, after the @)
     newbibtype=$(sed -n "${start}p" "$page" | cut -d@ -f2 | cut -d'{' -f1 | sed 's/ //g')

     # get the data from MRef
     head -n "$endm1" "$page" | tail -n "$length" > "$page.res"
     printf '%s\n' "$verified" >> "$outputfile"

     # Warn if entry types disagree
     if test "$bibtype" != "$newbibtype"
     then
       printf '%s\n' "discrepency: your is entry type was $bibtype, MRef is $newbibtype" >> "$outputlog"
     fi

     # Output the result: MRef's type and data under the user's own key
     printf '%s\n' "@${newbibtype}{${bibtag}," >> "$outputfile"
     cat "$page.res" >> "$outputfile"
     echo "}" >> "$outputfile"
     ;;
  esac

  echo " " >> "$outputfile"
  echo " " >> "$outputfile"

  # Get rid of temp files
  rm -f -- "$page" "$page.res"
done 3< "$tempfile"

# Get rid of temp files (quoted, so a basename containing spaces cannot
# make rm delete unrelated files)
rm -f -- "$tempfile"

# Print summary information.
#   $1  number of entries checked
#   $2  number of entries found on MRef
#   $3  number of entries that could not be uniquely matched
# An empty counter is treated as 0.
mr_print_summary() {
  if [ "${1:-0}" -ge 1 ]
  then
    printf '%s references were checked\n' "$1"
  else
    echo "no entry found"
  fi

  if [ "${2:-0}" -ge 1 ]
  then
    printf '%s references were found\n' "$2"
  else
    echo "none of your references were found"
  fi

  # Nothing is printed when every entry was matched
  if [ "${3:-0}" -ge 1 ]
  then
    printf '%s references could not be uniquely matched\n' "$3"
  fi
}

mr_print_summary "$refcount" "$okcount" "$numcount"

echo "done"
