diff options
author | wiz <wiz@pkgsrc.org> | 2021-11-24 15:56:18 +0000 |
---|---|---|
committer | wiz <wiz@pkgsrc.org> | 2021-11-24 15:56:18 +0000 |
commit | 0c08dad21381a5fa7d9a96599991ec3c81531fdf (patch) | |
tree | 6ae4753c3ba21c92a7be6ad7b14dfce37125b986 /meta-pkgs | |
parent | 72f5c698c813ca11dba6e42053a8fbf39303440e (diff) | |
download | pkgsrc-0c08dad21381a5fa7d9a96599991ec3c81531fdf.tar.gz |
nltk_data: add shared files for nltk_data packages
This also includes a tool to create these packages.
Diffstat (limited to 'meta-pkgs')
-rw-r--r-- | meta-pkgs/nltk_data/common.mk | 24 | ||||
-rw-r--r-- | meta-pkgs/nltk_data/howto.md | 21 | ||||
-rwxr-xr-x | meta-pkgs/nltk_data/split.py | 49 |
3 files changed, 94 insertions, 0 deletions
diff --git a/meta-pkgs/nltk_data/common.mk b/meta-pkgs/nltk_data/common.mk new file mode 100644 index 00000000000..7ba3fc8d732 --- /dev/null +++ b/meta-pkgs/nltk_data/common.mk @@ -0,0 +1,24 @@ +# $NetBSD: common.mk,v 1.1 2021/11/24 15:56:18 wiz Exp $ + +MASTER_SITES= https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${TYPE}/ +EXTRACT_SUFX?= .zip + +MAINTAINER?= pkgsrc-users@NetBSD.org +HOMEPAGE?= https://www.nltk.org/data.html +COMMENT?= Natural Language Toolkit (NLTK) Data + +INSTALLATION_DIRS+= share/nltk_data/${TYPE} + +UNPACK?= no + +do-build: + +.if ${UNPACK} == "no" +do-install: + ${INSTALL_DATA} ${_DISTDIR}/${DISTNAME}${EXTRACT_SUFX} ${DESTDIR}${PREFIX}/share/nltk_data/${TYPE} +.else +USE_TOOLS+= pax + +do-install: + cd ${WRKDIR} && ${PAX} -pp -rw ${DISTNAME} ${DESTDIR}${PREFIX}/share/nltk_data/${TYPE}/ +.endif diff --git a/meta-pkgs/nltk_data/howto.md b/meta-pkgs/nltk_data/howto.md new file mode 100644 index 00000000000..0f31fe0ab85 --- /dev/null +++ b/meta-pkgs/nltk_data/howto.md @@ -0,0 +1,21 @@ +# Sources + +Fetch https://www.nltk.org/nltk_data/, which is an XML file with an XSL +stylesheet: + + wget -O nltk_data.xml https://www.nltk.org/nltk_data/ + +should work. +This file contains one line per data package (108 entries as of 2021-11-24) +and some meta-package information. + +# Generating the packages + +Update the date in `split.py` and run it in the directory containing `nltk_data.xml`: + + split.py + +It will generate one package for each entry in the list in textproc/nltk_data-${id}. +You'll then need to run 'make mdi' in each directory. If the package existed +before, make sure that the data really changed (distinfo checksums/size differ) +before committing. 
#!/usr/bin/env python3
"""Generate pkgsrc package skeletons from the NLTK data index.

This is meta-pkgs/nltk_data/split.py.  It reads ``nltk_data.xml`` from the
current directory and, for every entry below the first child of the XML
root, writes a ``Makefile``, ``DESCR`` and ``PLIST`` into
``/usr/pkgsrc/textproc/nltk_data-<id>``.

Update ``PKG_DATE`` before regenerating the packages.
"""

import os
import xml.etree.ElementTree as ET

# Index file fetched from https://www.nltk.org/nltk_data/.
INDEX_FILE = "nltk_data.xml"
# Directory in which the nltk_data-<id> package directories are created.
PKGSRC_TEXTPROC = "/usr/pkgsrc/textproc"
# Version suffix of the generated packages; bump when regenerating.
PKG_DATE = "20211124"
# RCS id line written at the top of the generated Makefile and PLIST.
RCS_ID = "$NetBSD: split.py,v 1.1 2021/11/24 15:56:18 wiz Exp $"
# Written after "#LICENSE=" when the index entry has no license attribute,
# so that a value from a previously processed entry is never reused.
LICENSE_PLACEHOLDER = "# TODO: (see mk/license.mk)"


def render_makefile(pkg_id, name, subdir, webpage=None, license_name=None):
    """Return the text of the package Makefile.

    pkg_id, name and subdir are the ``id``, ``name`` and ``subdir``
    attributes of the index entry.  webpage is the optional ``webpage``
    attribute; without it the HOMEPAGE line is left empty.  license_name
    is the optional ``license`` attribute; without it a TODO placeholder
    is written.  The LICENSE line is always emitted commented out.
    """
    homepage = "HOMEPAGE=\t" + webpage if webpage is not None else ""
    if license_name is None:
        license_name = LICENSE_PLACEHOLDER
    lines = [
        f"# {RCS_ID}",
        "",
        f"DISTNAME= {pkg_id}",
        f"PKGNAME= nltk_data-{pkg_id}-{PKG_DATE}",
        "CATEGORIES= textproc",
        "DIST_SUBDIR= ${PKGNAME_NOREV}",
        "",
        homepage,
        f"COMMENT= NLTK Data - {name}",
        f"#LICENSE= {license_name}",
        "",
        f"TYPE= {subdir}",
        "",
        '.include "../../meta-pkgs/nltk_data/common.mk"',
        '.include "../../mk/bsd.pkg.mk"',
    ]
    return "\n".join(lines) + "\n"


def render_descr(name):
    """Return the text of the package DESCR file."""
    return (
        "This package contains data for NLTK, the Natural Language Toolkit.\n"
        "\n"
        f"This package contains data from/for {name}.\n"
    )


def render_plist(subdir, pkg_id):
    """Return the text of the package PLIST file.

    The path is below share/nltk_data/<subdir>, the directory that
    common.mk installs the distfile into.
    """
    return f"@comment {RCS_ID}\nshare/nltk_data/{subdir}/{pkg_id}.zip\n"


def generate_package(entry, base_dir=PKGSRC_TEXTPROC):
    """Write Makefile, DESCR and PLIST for one index entry.

    entry is an XML element with ``id``, ``name`` and ``subdir``
    attributes (KeyError is raised if one is missing) and optional
    ``webpage`` and ``license`` attributes.  Returns the package
    directory path.
    """
    attrs = entry.attrib
    pkg_id = attrs["id"]
    name = attrs["name"]
    subdir = attrs["subdir"]

    pkg_dir = f"{base_dir}/nltk_data-{pkg_id}"
    try:
        os.mkdir(pkg_dir)
    except FileExistsError:
        # Regenerating an existing package is expected; any other failure
        # (permissions, missing parent directory) should be reported.
        pass

    contents = {
        "Makefile": render_makefile(
            pkg_id, name, subdir, attrs.get("webpage"), attrs.get("license")
        ),
        "DESCR": render_descr(name),
        "PLIST": render_plist(subdir, pkg_id),
    }
    for filename, text in contents.items():
        with open(f"{pkg_dir}/{filename}", "w", encoding="utf-8") as f:
            f.write(text)
    return pkg_dir


def main(index_file=INDEX_FILE, base_dir=PKGSRC_TEXTPROC):
    """Generate one package directory per entry of the index file."""
    root = ET.parse(index_file).getroot()
    # The entries are the children of the first element below the root.
    for entry in root[0]:
        generate_package(entry, base_dir)


if __name__ == "__main__":
    main()