I recently spent a little time figuring out how to compile various pages into a PDF book. The key to doing this, I found, is to use 'htmldoc', which can easily be downloaded for various platforms.
Here is a summary of the scripts that I used.
make_pdf.sh controls the overall processing; it uses the list of files stored in 'chapters.txt' to determine which files to process.
#!/bin/sh
# make_pdf.sh - build the book PDF from the chapters listed in chapters.txt.
#
# Steps:
#   1. Convert every chapter named in chapters.txt (one per line) into
#      tmpNNNN.html using ./txttohtml.sh followed by greek.sed.
#   2. Typeset tmp*.html with htmldoc into output.pdf.
#   3. Run ./make_index.sh on output.pdf (it writes output.index.pdf,
#      output.txt and output.data.txt).
#   4. Render a status page (word count, missing and used image files).
#   5. Join book + index + status page with pdftk into $FINAL_PDF and
#      remove the intermediate files.
#
# Files read from the current directory: chapters.txt, greek.sed, title.html.
# missing_files and image_files are appended to by txttohtml.sh.
#
# Environment:
#   FINAL_PDF  name of the finished book (default: book.pdf). It must differ
#              from output.pdf, which is an input of the final join and is
#              deleted afterwards.

FINAL_PDF=${FINAL_PDF:-book.pdf}
MAX_CHAPTERS=500

# count_lines FILE - print the number of lines in FILE, or 0 if it is absent.
count_lines() {
  if [ -f "$1" ]; then
    wc -l < "$1" | awk '{print $1}'
  else
    echo 0
  fi
}

# Start from a clean slate: txttohtml.sh only appends to the two report
# files, and leftover chapter pages from a longer previous run would
# otherwise be matched by tmp*.html below.
rm -f missing_files image_files
rm -f tmp[0-9][0-9][0-9][0-9].html

i=0
while read -r CHAPTER || [ -n "$CHAPTER" ]; do
  # A blank line would run txttohtml.sh without a file argument.
  [ -n "$CHAPTER" ] || continue
  i=$((i + 1))
  TMP=$(printf 'tmp%04d.html' "$i")
  echo "$TMP"
  # stdin is redirected so the converter can never consume chapters.txt.
  ./txttohtml.sh "$CHAPTER" < /dev/null | sed -f greek.sed > "$TMP"
  if [ "$i" -eq "$MAX_CHAPTERS" ]; then
    break
  fi
done < chapters.txt

htmldoc --book --linkstyle plain --toctitle "The Molecular Universe" \
  -f output.pdf --no-title --headfootfont times --headfootsize 10 \
  --charset iso-8859-7 --embedfonts --size letter tmp*.html \
  --titlefile title.html

./make_index.sh output.pdf 12

# make a report on the current status to append to the book
if [ -f output.txt ]; then
  WORDS=$(wc -w < output.txt | awk '{print $1}')
else
  WORDS=0
fi
MISSING=$(count_lines missing_files)
IMAGES=$(count_lines image_files)

{
  echo "<pre>"
  echo "PDF file created:"
  date
  echo ""
  echo "Current word total:  $WORDS"
  echo ""
  echo "Number of image files that should be enlarged:  $MISSING"
  echo ""
  echo "Missing file names follow: "
  if [ -f missing_files ]; then cat missing_files; fi
  echo ""
  echo "Current image file count:  $IMAGES"
  echo "All image file names follow: "
  if [ -f image_files ]; then cat image_files; fi
  echo "</pre>"
} > status.html

htmldoc --webpage -f status.pdf --no-title --size letter status.html

# Joining several inputs needs an explicit "cat" operation, and pdftk cannot
# write onto one of its own inputs, so the result goes to a separate file.
# Intermediates are kept when the join fails so the run can be inspected.
if pdftk A=output.pdf B=output.index.pdf C=status.pdf \
  cat A B C output "$FINAL_PDF"; then
  rm -f output.pdf output.txt output.index.pdf output.data.txt \
    status.html status.pdf
else
  echo "make_pdf.sh: pdftk failed; intermediate files were kept" >&2
fi

if [ -f missing_files ]; then
  echo "THERE ARE  $MISSING  MISSING FILES"
  cat missing_files
fi
txttohtml.sh is a very crude script that converts a raw nanoblogger txt file into simple HTML that can be used as the input to the htmldoc processor.
#!/bin/sh
# txttohtml.sh - convert one raw nanoblogger entry into crude HTML on stdout.
#
# usage: txttohtml.sh <entry.txt>      (reads stdin when no file is given)
#
# What is emitted:
#   * the "TITLE:" header line becomes an <h1> heading
#   * only lines between "BODY:" and "END-----" are copied through
#   * <blockquote> ... </blockquote> becomes a bordered one-cell table and
#     "Note:" on the first quoted line is set in bold
#   * inside <table class="image..."> tables, "left"/"right" become "center"
#     and a full-width spacer table is added after the closing tag
#   * <img src=...> is re-pointed at the full-size image in ../../../images/
#     (name with "_scale" removed; a .png is tried when a .gif is not found)
#
# Side effects (relative to the current directory):
#   image_files    one line appended per <img> tag: the quoted path that was
#                  used, or the unquoted last path tried when none was found
#   missing_files  one line appended per image that could not be found
# A "COULD NOT LOCATE" message per missing image is written to stderr.

txttohtml() {
  # ${1+"$1"} passes the file name as one word, or nothing when it is unset.
  awk '
  # Return 1 when a first record can be read from path, else 0.
  function file_readable(path,    line, ok) {
    ok = ((getline line < path) > 0)
    close(path)
    return ok
  }

  # Header keywords are anchored so body text such as "SUBTITLE:" is not
  # mistaken for them; substr() below already assumes column one.
  /^TITLE:/ {
    title = substr($0, 7)
    print "<h1> " title " </h1>"
  }

  /<blockquote>/ {
    print "<table border=\"1\" cellpadding=\"10\"><tr><td>"
    getline
    sub(/Note:/, "<b>Note:</b>")
    print $0
    next
  }

  /<\/blockquote>/ {
    print "</td></tr></table>"
    next
  }

  /^BODY:/ {
    intext = 1
    next
  }

  !intext { next }

  /^END-----/ { exit }

  /<table class="image/ { imagetable = 1 }

  imagetable && /"left"/  { sub(/"left"/, "\"center\"") }
  imagetable && /"right"/ { sub(/"right"/, "\"center\"") }

  /<img src=/ {
    # Reduce the tag to the bare file name of its src attribute.
    record = $0
    sub(/^.*<img src=/, "", record)
    sub(/[ >].*$/, "", record)
    sub(/^.*\//, "", record)
    gsub(/"/, "", record)     # both quotes, also when src has no directory
    sub(/_scale/, "", record)

    # Try the original extension first, then .png in place of .gif.
    candidate = "../../../images/" record
    located = file_readable(candidate)
    if (!located && sub(/\.gif$/, ".png", candidate)) {
      located = file_readable(candidate)
    }

    if (located) {
      quoted = "\"" candidate "\""
      # Greedy on purpose, as before: everything from the first to the last
      # double quote on the line is replaced by the new path.
      sub(/".*"/, quoted)
      print quoted >> "image_files"
    } else {
      # stderr instead of /dev/tty so this also works without a terminal.
      print "COULD NOT LOCATE " record > "/dev/stderr"
      print record >> "missing_files"
      print candidate >> "image_files"
    }
  }

  /<\/table/ && imagetable {
    imagetable = 0
    print $0
    print "<table width=\"100%\" summary=\"\"><tr><td> </td></tr></table>"
    next
  }

  { print $0 }
  ' ${1+"$1"}
}

txttohtml "$@"
make_index.sh comes from pdftk and uses two tools from the pdftk site to create a crude book-like index for the book. I modified the make_index.sh script a little, so I am including it here.
#!/bin/sh
# make_index.sh, version 1.0
# usage: make_index.sh <PDF filename> <page window>
# requires: pdftk, page_refs,
# pdftotext, enscript, ps2pdf
# (kw_catcher is not invoked by this version, and the <page window>
# argument is accepted but not used)
#
# by Ross Presser, Imtek.com
# adapted by Sid Steward
# http://www.pdfhacks.com/kw_index/
# modified somewhat from the original distributed version to correct
# problems encountered in initial testing
#
# For an input NAME.pdf in the current directory this writes:
#   NAME.data.txt   pdftk dump_data output, with LowercaseRomanNumerals
#                   rewritten to DecimalArabicNumerals
#   NAME.txt        text extracted by pdftotext
#   NAME.index.pdf  the typeset two-column index
# The index terms are read from index-terms.dat in the current directory.

PATH=/opt/local/bin:/opt/local/sbin:$PATH
export PATH
# Exported so the tools below actually run with it.
LANG=C
export LANG

if [ "$#" -lt 1 ]; then
  echo "usage: make_index.sh <PDF filename> <page window>" >&2
  exit 2
fi

fname=$(basename "$1" .pdf)

# Per-run scratch name instead of a fixed file called "j", which could
# overwrite an unrelated file in the working directory.
data_tmp="${fname}.data.txt.tmp.$$"
trap 'rm -f "$data_tmp"' EXIT

pdftk "${fname}.pdf" dump_data output "${fname}.data.txt" &&
  sed 's/LowercaseRomanNumerals/DecimalArabicNumerals/' \
    "${fname}.data.txt" > "$data_tmp" &&
  mv "$data_tmp" "${fname}.data.txt" &&
  pdftotext "${fname}.pdf" "${fname}.txt" &&
  page_refs "${fname}.txt" index-terms.dat "${fname}.data.txt" \
    | sed 's/PageLabelNumStyle://g' \
    | enscript --columns 2 --font 'Times-Roman@10' \
      --header '|Index' --header-font 'ArialBold@20' \
      --margins 54:54:36:54 --word-wrap --output - \
    | ps2pdf - "${fname}.index.pdf"