summary | refs | log | tree | commit | diff
path: root/biology
diff options
context:
space:
mode:
author brook <brook@pkgsrc.org> 2021-05-27 17:11:42 +0000
committer brook <brook@pkgsrc.org> 2021-05-27 17:11:42 +0000
commit b58683de491832132c9508713237bfd139b01a20 (patch)
tree 618e569c9b743666f7e3a94bb22cbabe9b11014d /biology
parent c243c2cb4a98bff10432989297099823b4253610 (diff)
download pkgsrc-b58683de491832132c9508713237bfd139b01a20.tar.gz
biology/filter-fastq: add filter-fastq version 0.0.0.20210527
Filter reads from a FASTQ file using a list of identifiers.

Each entry in the input FASTQ file (or files) is checked against all
entries in the identifier list. Matches are included by default, or
excluded if the --invert flag is supplied. Paired-end files are kept
consistent (in order).

This is almost certainly not the most efficient way to implement this
filtering procedure. I tested a few different strategies and this one
seemed the fastest. Current timing with 16 processes is about 10
minutes per 1M paired reads with gzip'd input and output, depending on
the length of the identifier list to filter by.

usage: filter_fastq.py [-h] [-i INPUT] [-1 READ1] [-2 READ2] [-p NUM_THREADS]
                       [-o OUTPUT] [-f FILTER_FILE] [-v] [--gzip]
Diffstat (limited to 'biology')
-rw-r--r-- biology/filter-fastq/DESCR 15
-rw-r--r-- biology/filter-fastq/Makefile 32
-rw-r--r-- biology/filter-fastq/PLIST 3
-rw-r--r-- biology/filter-fastq/distinfo 6
4 files changed, 56 insertions, 0 deletions
diff --git a/biology/filter-fastq/DESCR b/biology/filter-fastq/DESCR
new file mode 100644
index 00000000000..64f6ecc3f7a
--- /dev/null
+++ b/biology/filter-fastq/DESCR
@@ -0,0 +1,15 @@
+Filter reads from a FASTQ file using a list of identifiers.
+
+Each entry in the input FASTQ file (or files) is checked against all
+entries in the identifier list. Matches are included by default, or
+excluded if the --invert flag is supplied. Paired-end files are kept
+consistent (in order).
+
+This is almost certainly not the most efficient way to implement this
+filtering procedure. I tested a few different strategies and this one
+seemed the fastest. Current timing with 16 processes is about 10
+minutes per 1M paired reads with gzip'd input and output, depending on
+the length of the identifier list to filter by.
+
+usage: filter_fastq.py [-h] [-i INPUT] [-1 READ1] [-2 READ2] [-p NUM_THREADS]
+ [-o OUTPUT] [-f FILTER_FILE] [-v] [--gzip]
diff --git a/biology/filter-fastq/Makefile b/biology/filter-fastq/Makefile
new file mode 100644
index 00000000000..14fa8a995b7
--- /dev/null
+++ b/biology/filter-fastq/Makefile
@@ -0,0 +1,32 @@
+# $NetBSD: Makefile,v 1.1 2021/05/27 17:11:42 brook Exp $
+
+PKGNAME= filter-fastq-0.0.0.20210527
+GITHUB_PROJECT= filter-fastq
+GITHUB_TAG= d2c9218
+DISTNAME= filter-fastq
+CATEGORIES= biology
+MASTER_SITES= ${MASTER_SITE_GITHUB:=stephenfloor/}
+EXTRACT_SUFX= .zip
+DIST_SUBDIR= ${GITHUB_PROJECT}
+
+MAINTAINER= pkgsrc-users@NetBSD.org
+HOMEPAGE= https://github.com/stephenfloor/filter-fastq/
+COMMENT= Filter reads from a FASTQ file
+LICENSE= mit
+
+WRKSRC= ${WRKDIR}/filter-fastq-d2c92182674a6d5aa257fb63eb60ac24ddb8b4a0
+USE_LANGUAGES= # none
+NO_BUILD= yes
+
+PYTHON_VERSIONS_ACCEPTED= 27
+
+REPLACE_PYTHON+= filter_fastq.py
+
+INSTALLATION_DIRS+= bin share/doc/filter_fastq
+
+do-install:
+ ${INSTALL_SCRIPT} ${WRKSRC}/filter_fastq.py ${DESTDIR}${PREFIX}/bin
+ ${INSTALL_DATA} ${WRKSRC}/README.md ${DESTDIR}${PREFIX}/share/doc/filter_fastq
+
+.include "../../lang/python/application.mk"
+.include "../../mk/bsd.pkg.mk"
diff --git a/biology/filter-fastq/PLIST b/biology/filter-fastq/PLIST
new file mode 100644
index 00000000000..ef6a4a260a1
--- /dev/null
+++ b/biology/filter-fastq/PLIST
@@ -0,0 +1,3 @@
+@comment $NetBSD: PLIST,v 1.1 2021/05/27 17:11:42 brook Exp $
+bin/filter_fastq.py
+share/doc/filter_fastq/README.md
diff --git a/biology/filter-fastq/distinfo b/biology/filter-fastq/distinfo
new file mode 100644
index 00000000000..1e7b2f1cd90
--- /dev/null
+++ b/biology/filter-fastq/distinfo
@@ -0,0 +1,6 @@
+$NetBSD: distinfo,v 1.1 2021/05/27 17:11:42 brook Exp $
+
+SHA1 (filter-fastq/filter-fastq-d2c9218.zip) = 44b8bbef2690b598a2f06930396fbbf5828e364c
+RMD160 (filter-fastq/filter-fastq-d2c9218.zip) = 715b0e52b5714cea1fa4a64bfe8cbef919cee2ce
+SHA512 (filter-fastq/filter-fastq-d2c9218.zip) = c5ab23b86ac8690f58bf05bd0a16f3b315bd7a71f67bce267fe9f36b5e528ac228c57c2521cad8c547159915cf77433848be58d463100f407693927493ad8f5f
+Size (filter-fastq/filter-fastq-d2c9218.zip) = 4249 bytes