summaryrefslogtreecommitdiff
path: root/meta-pkgs
diff options
context:
space:
mode:
authorwiz <wiz@pkgsrc.org>2021-11-24 15:56:18 +0000
committerwiz <wiz@pkgsrc.org>2021-11-24 15:56:18 +0000
commit0c08dad21381a5fa7d9a96599991ec3c81531fdf (patch)
tree6ae4753c3ba21c92a7be6ad7b14dfce37125b986 /meta-pkgs
parent72f5c698c813ca11dba6e42053a8fbf39303440e (diff)
downloadpkgsrc-0c08dad21381a5fa7d9a96599991ec3c81531fdf.tar.gz
nltk_data: add shared files for nltk_data packages
This also includes a tool to create these packages.
Diffstat (limited to 'meta-pkgs')
-rw-r--r--meta-pkgs/nltk_data/common.mk24
-rw-r--r--meta-pkgs/nltk_data/howto.md21
-rwxr-xr-xmeta-pkgs/nltk_data/split.py49
3 files changed, 94 insertions, 0 deletions
diff --git a/meta-pkgs/nltk_data/common.mk b/meta-pkgs/nltk_data/common.mk
new file mode 100644
index 00000000000..7ba3fc8d732
--- /dev/null
+++ b/meta-pkgs/nltk_data/common.mk
@@ -0,0 +1,24 @@
+# $NetBSD: common.mk,v 1.1 2021/11/24 15:56:18 wiz Exp $
+# Shared by the nltk_data-* packages; uses TYPE and DISTNAME from the including Makefile.
+MASTER_SITES= https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${TYPE}/
+EXTRACT_SUFX?= .zip
+
+MAINTAINER?= pkgsrc-users@NetBSD.org
+HOMEPAGE?= https://www.nltk.org/data.html
+COMMENT?= Natural Language Toolkit (NLTK) Data
+
+# UNPACK=no (the default) installs the distfile itself; any other value installs the unpacked tree.
+UNPACK?= no
+INSTALLATION_DIRS+= share/nltk_data/${TYPE}
+
+# Intentionally empty: there is no build step.
+do-build:
+
+.if ${UNPACK} != "no"
+USE_TOOLS+= pax
+do-install:
+	cd ${WRKDIR} && ${PAX} -pp -rw ${DISTNAME} ${DESTDIR}${PREFIX}/share/nltk_data/${TYPE}/
+.else
+do-install:
+	${INSTALL_DATA} ${_DISTDIR}/${DISTNAME}${EXTRACT_SUFX} ${DESTDIR}${PREFIX}/share/nltk_data/${TYPE}
+.endif
diff --git a/meta-pkgs/nltk_data/howto.md b/meta-pkgs/nltk_data/howto.md
new file mode 100644
index 00000000000..0f31fe0ab85
--- /dev/null
+++ b/meta-pkgs/nltk_data/howto.md
@@ -0,0 +1,21 @@
+# Sources
+
+Fetch https://www.nltk.org/nltk_data/, which is an XML file with an XSL
+stylesheet. For example,
+
+    wget -O nltk_data.xml https://www.nltk.org/nltk_data/
+
+should work.
+This file contains one line per data set (108 entries as of 2021-11-24)
+and some meta package information.
+
+# Generating the packages
+
+Update the date in `split.py` and run it:
+
+    split.py
+
+It will generate one package for each entry in the list in textproc/nltk_data-${id}.
+You'll then need to run 'make mdi' in each directory. If the package existed
+before, make sure that the data really changed (distinfo checksums/size differ)
+before committing.
diff --git a/meta-pkgs/nltk_data/split.py b/meta-pkgs/nltk_data/split.py
new file mode 100755
index 00000000000..0213c11af39
--- /dev/null
+++ b/meta-pkgs/nltk_data/split.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# Generate one pkgsrc package skeleton (Makefile, DESCR, PLIST) in
+# /usr/pkgsrc/textproc/nltk_data-<id> for every entry of nltk_data.xml.
+import os
+import xml.etree.ElementTree as ET
+
+tree = ET.parse('nltk_data.xml')
+
+root = tree.getroot()
+
+# Every child of the root's first element describes one data package.
+for child in root[0]:
+    id = child.attrib["id"]
+    path = f"/usr/pkgsrc/textproc/nltk_data-{id}"
+    try:
+        os.mkdir(path)
+    except FileExistsError:
+        pass  # regenerating an existing package; any other error should surface
+    name = child.attrib["name"]
+    # Optional attributes get an explicit default so that an entry lacking one
+    # cannot reuse the previous entry's value (or Python's `license` builtin).
+    webpage = ("HOMEPAGE=\t" + child.attrib["webpage"]) if "webpage" in child.attrib else ""
+    license = child.attrib.get("license", "")
+    subdir = child.attrib["subdir"]
+    with open(path + "/Makefile", "w") as f:
+        print(f"""# $NetBSD: split.py,v 1.1 2021/11/24 15:56:18 wiz Exp $
+
+DISTNAME= {id}
+PKGNAME= nltk_data-{id}-20211124
+CATEGORIES= textproc
+DIST_SUBDIR= ${{PKGNAME_NOREV}}
+
+{webpage}
+COMMENT= NLTK Data - {name}
+#LICENSE= {license}
+
+TYPE= {subdir}
+
+.include "../../meta-pkgs/nltk_data/common.mk"
+.include "../../mk/bsd.pkg.mk"
+""", file=f, end='')
+    with open(path + "/DESCR", "w") as f:
+        print(f"""This package contains data for NLTK, the Natural Language Toolkit.
+
+This package contains data from/for {name}.""", file=f)
+    # The PLIST entry must match common.mk's install directory, share/nltk_data/${{TYPE}}.
+    with open(path + "/PLIST", "w") as f:
+        print(f"""@comment $NetBSD: split.py,v 1.1 2021/11/24 15:56:18 wiz Exp $
+share/nltk_data/{subdir}/{id}.zip""", file=f)