summary | refs | log | tree | commit | diff
path: root/converters/doc2html
diff options
context:
space:
mode:
author: agc <agc@pkgsrc.org> 2001-05-14 14:03:20 +0000
committer: agc <agc@pkgsrc.org> 2001-05-14 14:03:20 +0000
commit: 9204924f9dc55716ae0b7c86089afe9369b07025 (patch)
tree: 21443ead9fdef073932ff126438e7e2241ba04a6 /converters/doc2html
parent: e870d601510254c33660b6b4855a3fbeec019da0 (diff)
download: pkgsrc-9204924f9dc55716ae0b7c86089afe9369b07025.tar.gz
Initial import of doc2html-2.1 into the packages collection.
Provided in PR 12884 by Jesse Off (joff@newmonics.com)

"External converter script for ht://Dig (version 3.1.4 and later), that
converts Microsoft Word, Excel and Powerpoint files, and PDF,
PostScript, RTF, and WordPerfect files to text (in HTML form) so they
can be indexed. Uses a variety of conversion programs:

    wp2html   - to convert Wordperfect and Word7 & 97 documents to HTML
    catdoc    - to extract text from Word documents
    rtf2html  - to convert RTF documents to HTML
    pdftotext - to extract text from Adobe PDFs
    ps2ascii  - to extract text from PostScript
    pptHtml   - to convert Powerpoint files to HTML
    xlHtml    - to convert Excel spreadsheets to HTML
or
    xls2csv   - to obtain data from Excel spreadsheets.

Written by David Adams (University of Southampton), and based on the
conv_doc.pl script by Gilles Detillieux."
Diffstat (limited to 'converters/doc2html')
-rw-r--r-- converters/doc2html/Makefile 31
-rw-r--r-- converters/doc2html/distinfo 5
-rw-r--r-- converters/doc2html/patches/patch-aa 65
-rw-r--r-- converters/doc2html/pkg/DESCR 17
-rw-r--r-- converters/doc2html/pkg/MESSAGE 15
-rw-r--r-- converters/doc2html/pkg/PLIST 2
6 files changed, 135 insertions, 0 deletions
diff --git a/converters/doc2html/Makefile b/converters/doc2html/Makefile
new file mode 100644
index 00000000000..fa49d9d49e9
--- /dev/null
+++ b/converters/doc2html/Makefile
@@ -0,0 +1,31 @@
+# $NetBSD: Makefile,v 1.1.1.1 2001/05/14 14:03:20 agc Exp $
+#
+
+DISTNAME= doc2html
+PKGNAME= doc2html-2.1
+CATEGORIES= converters
+MASTER_SITES= http://www.htdig.org/files/contrib/parsers/
+
+MAINTAINER= packages@netbsd.org
+HOMEPAGE= http://www.htdig.org
+COMMENT= PERL external filter for htdig to convert numerous doc formats to HTML
+
+DEPENDS+= xlHtml-*:../../converters/xlHtml
+DEPENDS+= rtf2html-*:../../converters/rtf2html
+DEPENDS+= ghostscript{,-nox11}-[6-9]*:../../print/ghostscript
+DEPENDS+= xpdf-*:../../graphics/xpdf
+DEPENDS+= catdoc-*:../../textproc/catdoc
+
+USE_PERL5= YES
+
+do-build:
+ ${SED} -e "s%@@LOCALBASE@@%${LOCALBASE}%g" \
+ -e "s%/usr/bin/perl%${PERL5}%g" \
+ -e "s%/bin/sed%${SED}%g" \
+ -e "s%@@X11BASE@@%${X11BASE}%g" < ${WRKSRC}/doc2html.pl \
+ > ${WRKSRC}/doc2html
+
+do-install:
+ ${INSTALL_SCRIPT} ${WRKSRC}/doc2html ${PREFIX}/bin/doc2html
+
+.include "../../mk/bsd.pkg.mk"
diff --git a/converters/doc2html/distinfo b/converters/doc2html/distinfo
new file mode 100644
index 00000000000..d1fb850c8b7
--- /dev/null
+++ b/converters/doc2html/distinfo
@@ -0,0 +1,5 @@
+$NetBSD: distinfo,v 1.1.1.1 2001/05/14 14:03:20 agc Exp $
+
+SHA1 (doc2html.tar.gz) = 78f78950a87f9134dc871d68e897857c7fa76dbc
+Size (doc2html.tar.gz) = 12444 bytes
+SHA1 (patch-aa) = 86ca749c41251845b06bab5fe59cdddcde01ab63
diff --git a/converters/doc2html/patches/patch-aa b/converters/doc2html/patches/patch-aa
new file mode 100644
index 00000000000..1bd2b4476b2
--- /dev/null
+++ b/converters/doc2html/patches/patch-aa
@@ -0,0 +1,65 @@
+$NetBSD: patch-aa,v 1.1.1.1 2001/05/14 14:03:20 agc Exp $
+
+--- doc2html.pl.orig Mon Sep 11 05:29:20 2000
++++ doc2html.pl
+@@ -20,48 +20,48 @@
+ # If all else fails, attempts to read file without conversion.
+
+ # wp2html binary
+-$WP2HTML = "/opt/local/wp2html-3.2/bin/wp2html";
++$WP2HTML = "";
+
+ # rtf2html converts Rich Text Font documents to HTML
+ # (get it from: http://www.res.bbsrc.ac.uk/wp2html/):
+-$RTF2HTML = "/opt/local/rtf2html-1.1/bin/rtf2html";
++$RTF2HTML = "@@LOCALBASE@@/bin/rtf2html";
+
+ # Catdoc converts MS Word to plain text
+ # (get it from: http://www.fe.msk.ru/~vitus/catdoc/):
+
+ #version of catdoc for Word6, Word7 & Word97 files:
+-$CATDOC = "/opt/local/catdoc-0.91.4/bin/catdoc";
++$CATDOC = "@@LOCALBASE@@/bin/catdoc";
+
+ #version of catdoc for Word2 files:
+-$CATDOC2 = "/opt/local/catdoc-0.91.4/bin/catdoc";
++$CATDOC2 = "@@LOCALBASE@@/bin/catdoc";
+
+ #version of catdoc for Word 5.1 for MAC:
+-$CATDOCM = "/opt/local/catdoc-0.91.4/bin/catdoc";
++$CATDOCM = "@@LOCALBASE@@/bin/catdoc";
+
+ # PostScript to text converter
+ # (get it from the ghostscript 3.33 (or later) package):
+-$CATPS = "/usr/freeware/bin/ps2ascii";
++$CATPS = "@@LOCALBASE@@/bin/ps2ascii";
+
+ # add to search path the directory which contains gs:
+-$ENV{PATH} .= ":/usr/freeware/bin";
++# $ENV{PATH} .= ":/usr/freeware/bin";
+
+ # PDF to text converter and pdfinfo tool
+ # (get them from the xpdf package at http://www.foolabs.com/xpdf/):
+-$CATPDF = "/opt/local/xpdf-0.9/bin/pdftotext";
+-$PDFINFO = "/opt/local/xpdf-0.9/bin/pdfinfo";
++$CATPDF = "@@X11BASE@@/bin/pdftotext";
++$PDFINFO = "@@X11BASE@@/bin/pdfinfo";
+
+ #Microsoft Excel to HTML converter
+ # (get it from www.xlHtml.org)
+-$XLS2HTML = "/opt/local/xlHtml-0.2.7.2/bin/xlHtml";
++$XLS2HTML = "@@LOCALBASE@@/bin/xlHtml";
+
+ #Microsoft Powerpoint to HTML converter
+ # (get it from www.xlHtml.org)
+-$PPT2HTML = "/opt/local/xlHtml-0.2.7.2/bin/pptHtml";
++$PPT2HTML = "@@LOCALBASE@@/bin/pptHtml";
+
+ #MicroSoft Excel to .CSV converter
+ # (you don't need this if you have xlHtml)
+ # (get it with catdoc)
+-$CATXLS = "/opt/local/catdoc-0.91.2/bin/xls2csv";
++$CATXLS = "";
+
+ ########################################################################################
+ # Written by David Adams <d.j.adams@soton.ac.uk>.
diff --git a/converters/doc2html/pkg/DESCR b/converters/doc2html/pkg/DESCR
new file mode 100644
index 00000000000..4e1130a0505
--- /dev/null
+++ b/converters/doc2html/pkg/DESCR
@@ -0,0 +1,17 @@
+External converter script for ht://Dig (version 3.1.4 and later), that
+converts Microsoft Word, Excel and Powerpoint files, and PDF,
+PostScript, RTF, and WordPerfect files to text (in HTML form) so they
+can be indexed. Uses a variety of conversion programs:
+
+ wp2html - to convert Wordperfect and Word7 & 97 documents to HTML
+ catdoc - to extract text from Word documents
+ rtf2html - to convert RTF documents to HTML
+ pdftotext - to extract text from Adobe PDFs
+ ps2ascii - to extract text from PostScript
+ pptHtml - to convert Powerpoint files to HTML
+ xlHtml - to convert Excel spreadsheets to HTML
+or
+ xls2csv - to obtain data from Excel spreadsheets.
+
+Written by David Adams (University of Southampton), and based on the
+conv_doc.pl script by Gilles Detillieux.
diff --git a/converters/doc2html/pkg/MESSAGE b/converters/doc2html/pkg/MESSAGE
new file mode 100644
index 00000000000..2092040533e
--- /dev/null
+++ b/converters/doc2html/pkg/MESSAGE
@@ -0,0 +1,15 @@
+==========================================================================
+$NetBSD: MESSAGE,v 1.1.1.1 2001/05/14 14:03:20 agc Exp $
+
+To use with htdig, add the following to your htdig.conf file:
+
+external_parsers: application/rtf->text/html ${PREFIX}/bin/doc2html \
+ text/rtf->text/html ${PREFIX}/bin/doc2html \
+ application/pdf->text/html ${PREFIX}/bin/doc2html \
+ application/postscript->text/html ${PREFIX}/bin/doc2html \
+ application/msword->text/html ${PREFIX}/bin/doc2html \
+ application/msexcel->text/html ${PREFIX}/bin/doc2html \
+ application/vnd.ms-excel->text/html ${PREFIX}/bin/doc2html \
+ application/vnd.ms-powerpoint->text/html ${PREFIX}/bin/doc2html
+
+==========================================================================
diff --git a/converters/doc2html/pkg/PLIST b/converters/doc2html/pkg/PLIST
new file mode 100644
index 00000000000..d2928613c09
--- /dev/null
+++ b/converters/doc2html/pkg/PLIST
@@ -0,0 +1,2 @@
+@comment $NetBSD: PLIST,v 1.1.1.1 2001/05/14 14:03:20 agc Exp $
+bin/doc2html