Warning
This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.
Source code for galaxy.datatypes.mothur
"""
Mothur Metagenomics Datatypes
"""
import logging
import re
from typing import (
List,
Optional,
)
from galaxy.datatypes.data import Text
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.protocols import (
DatasetProtocol,
HasMetadata,
)
from galaxy.datatypes.sniff import (
build_sniff_from_prefix,
FilePrefix,
get_headers,
iter_headers,
)
from galaxy.datatypes.tabular import Tabular
from galaxy.util import unicodify
log = logging.getLogger(__name__)
@build_sniff_from_prefix
class Otu(Text):
    """Mothur OTU (operational taxonomic unit) datatype."""

    file_ext = "mothur.otu"
    MetadataElement(name="columns", default=0, desc="Number of columns", readonly=True, visible=True, no_value=0)
    MetadataElement(name="labels", default=[], desc="Label Names", readonly=True, visible=True, no_value=[])
    MetadataElement(name="otulabels", default=[], desc="OTU Names", readonly=True, visible=True, no_value=[])

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """
        Set metadata for Otu files.

        Records the number of data lines, the widest column count, the sorted
        set of labels (first column of every data line) and the sorted OTU
        labels (fields after the second one on the first line).

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> from galaxy.util.bunch import Bunch
        >>> dataset = Bunch()
        >>> dataset.metadata = Bunch()
        >>> otu = Otu()
        >>> dataset.get_file_name = lambda : get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> dataset.has_data = lambda: True
        >>> otu.set_meta(dataset)
        >>> dataset.metadata.columns
        100
        >>> len(dataset.metadata.labels) == 37
        True
        >>> len(dataset.metadata.otulabels) == 98
        True
        """
        super().set_meta(dataset, overwrite=overwrite, **kwd)
        if not dataset.has_data():
            return

        label_names = set()
        otulabel_names = set()
        ncols = 0
        data_lines = 0

        # The OTU labels are taken from the very first line of the file.
        first_line = get_headers(dataset.get_file_name(), sep="\t", count=1)
        if first_line and len(first_line[0]) > 2:
            otulabel_names = first_line[0][2:]

        # Collect label names and count data lines; lines with fewer than two
        # fields or starting with "@" are not data lines.
        for line in iter_headers(dataset.get_file_name(), sep="\t", count=-1):
            if len(line) >= 2 and not line[0].startswith("@"):
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])

        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.columns = ncols
        dataset.metadata.labels = sorted(label_names)
        dataset.metadata.otulabels = sorted(otulabel_names)

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is otu (operational taxonomic unit) format

        Every non-"@" line needs at least two fields; from the second such line
        on, the second field must be an integer equal to the number of fields
        that follow it. More than two such lines are required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> Otu().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.otu' )
        >>> Otu().sniff( fname )
        False
        """
        count = 0
        for line in iter_headers(file_prefix, sep="\t"):
            if line[0].startswith("@"):
                continue
            if len(line) < 2:
                return False
            if count >= 1:
                try:
                    check = int(line[1])
                except ValueError:
                    return False
                if check + 2 != len(line):
                    return False
            count += 1
        return count > 2
class Sabund(Otu):
    file_ext = "mothur.sabund"

    def __init__(self, **kwd):
        """
        http://www.mothur.org/wiki/Sabund_file
        """
        super().__init__(**kwd)

    def init_meta(self, dataset: HasMetadata, copy_from: Optional[HasMetadata] = None) -> None:
        """Delegate metadata initialisation to the parent datatype."""
        super().init_meta(dataset, copy_from=copy_from)

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is sabund format

        label<TAB>count[<TAB>value(1..n)]

        Every non-"@" line must carry an integer count in its second field that
        equals the number of fields after it, and all those fields must be
        integers. At least one such line is required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.sabund' )
        >>> Sabund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.sabund' )
        >>> Sabund().sniff( fname )
        False
        """
        data_rows = 0
        for fields in iter_headers(file_prefix, sep="\t"):
            if fields[0].startswith("@"):
                continue
            if len(fields) < 2:
                return False
            try:
                declared = int(fields[1])
                if declared + 2 != len(fields):
                    return False
                for value in fields[2:]:
                    int(value)
            except ValueError:
                return False
            data_rows += 1
        return data_rows > 0
class GroupAbund(Otu):
    file_ext = "mothur.shared"
    MetadataElement(name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[])

    def init_meta(self, dataset: HasMetadata, copy_from: Optional[HasMetadata] = None) -> None:
        """Delegate metadata initialisation to the parent datatype."""
        super().init_meta(dataset, copy_from=copy_from)

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, skip: Optional[int] = 1, **kwd) -> None:
        """
        Set metadata for shared files: data line count, column count, labels
        (first column) and groups (second column).

        Lines with fewer than two fields (for example blank lines) are ignored
        rather than raising IndexError.
        """
        super().set_meta(dataset, overwrite=overwrite, **kwd)

        # See if file starts with header line
        if not dataset.has_data():
            return

        label_names = set()
        group_names = set()
        data_lines = 0
        ncols = 0

        for line in iter_headers(dataset.get_file_name(), sep="\t", count=-1):
            if len(line) < 2:
                # Neither a header nor a data line; there is no group column to read.
                continue
            if line[0] == "label" and line[1] == "Group":
                skip = 1
            else:
                # NOTE(review): `skip` is reassigned on every line, so the stored
                # value reflects only the last line processed. Confirm whether
                # "a header line is present" was the intended meaning.
                skip = 0
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])
                group_names.add(line[1])

        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.columns = ncols
        dataset.metadata.labels = sorted(label_names)
        dataset.metadata.groups = sorted(group_names)
        dataset.metadata.skip = skip

    def sniff_prefix(self, file_prefix: FilePrefix, vals_are_int=False) -> bool:
        """
        Determines whether the file is a otu (operational taxonomic unit)
        Shared format

        label<TAB>group<TAB>count[<TAB>value(1..n)]

        The first line is column headings as of Mothur v 1.2

        When ``vals_are_int`` is true the value columns must parse as integers,
        otherwise as floats.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        False
        """
        convert = int if vals_are_int else float
        count = 0
        for line in iter_headers(file_prefix, sep="\t"):
            if line[0].startswith("@"):
                continue
            if len(line) < 3:
                return False
            # A first line starting with "label" is treated as the heading row.
            if count > 0 or line[0] != "label":
                try:
                    check = int(line[2])
                    if check + 3 != len(line):
                        return False
                    for value in line[3:]:
                        convert(value)
                except ValueError:
                    return False
            count += 1
        return count > 1
@build_sniff_from_prefix
class SecondaryStructureMap(Tabular):
    file_ext = "mothur.map"

    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        super().__init__(**kwd)
        self.column_names = ["Map"]

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a secondary structure map format

        A single column with an integer value which indicates the row that this
        row maps to. Check to make sure if structMap[10] = 380 then
        structMap[380] = 10 and vice versa.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.map' )
        >>> SecondaryStructureMap().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.map' )
        >>> SecondaryStructureMap().sniff( fname )
        False
        """
        # Maps a forward-referenced row number to the row that pointed at it.
        expected_partner = {}
        row = 0
        for row, fields in enumerate(iter_headers(file_prefix, sep="\t"), start=1):
            if len(fields) > 1:
                return False
            try:
                target = int(fields[0])
                if target > row:
                    expected_partner[target] = row
                elif target > 0 or row in expected_partner:
                    # A missing entry raises KeyError, which is handled below.
                    if expected_partner[row] != target:
                        return False
            except (ValueError, KeyError):
                return False
        return row >= 3
class AlignCheck(Tabular):
    file_ext = "mothur.align.check"

    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        super().__init__(**kwd)
        counters = ["pound", "dash", "plus", "equal", "loop", "tilde", "total"]
        self.column_names = ["name"] + counters
        self.column_types = ["str"] + ["int"] * len(counters)
        self.comment_lines = 1

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """
        Run the parent metadata pass, then overwrite the column names, column
        types and comment-line count with this datatype's fixed values, and
        subtract the comment lines from an integer ``data_lines``.
        """
        super().set_meta(dataset, overwrite=overwrite, **kwd)
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types
        dataset.metadata.comment_lines = self.comment_lines
        if isinstance(dataset.metadata.data_lines, int):
            dataset.metadata.data_lines = dataset.metadata.data_lines - self.comment_lines
class AlignReport(Tabular):
    """
    QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template
    AY457915 501 82283 1525 kmer 89.07 needleman 5 501 1 499 499 2 0 0 97.6
    """

    file_ext = "mothur.align.report"

    def __init__(self, **kwd):
        """Initialize AlignReport datatype"""
        super().__init__(**kwd)
        query_columns = ["QueryName", "QueryLength"]
        template_columns = ["TemplateName", "TemplateLength"]
        search_columns = ["SearchMethod", "SearchScore", "AlignmentMethod"]
        position_columns = ["QueryStart", "QueryEnd", "TemplateStart", "TemplateEnd"]
        alignment_columns = [
            "PairwiseAlignmentLength",
            "GapsInQuery",
            "GapsInTemplate",
            "LongestInsert",
            "SimBtwnQuery&Template",
        ]
        self.column_names = (
            query_columns + template_columns + search_columns + position_columns + alignment_columns
        )
class DistanceMatrix(Text):
    file_ext = "mothur.dist"
    MetadataElement(
        name="sequence_count",
        default=0,
        desc="Number of sequences",
        readonly=True,
        visible=True,
        optional=True,
        no_value="?",
    )

    def init_meta(self, dataset: HasMetadata, copy_from: Optional[HasMetadata] = None) -> None:
        """Delegate metadata initialisation to the parent datatype."""
        super().init_meta(dataset, copy_from=copy_from)

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, skip: Optional[int] = 0, **kwd) -> None:
        """
        Store the sequence count found on the first non-"@" line that parses
        as an integer.

        Lines that fail to parse are logged as warnings, except for
        PairwiseDistanceMatrix instances, and scanning moves on to the next line.
        """
        super().set_meta(dataset, overwrite=overwrite, skip=skip, **kwd)
        for fields in iter_headers(dataset.get_file_name(), sep="\t"):
            if fields[0].startswith("@"):
                continue
            try:
                # seq count sometimes preceded by tab
                dataset.metadata.sequence_count = int("".join(fields))
            except Exception as e:
                if not isinstance(self, PairwiseDistanceMatrix):
                    log.warning(f"DistanceMatrix set_meta {e}")
            else:
                break
@build_sniff_from_prefix
class LowerTriangleDistanceMatrix(DistanceMatrix):
    file_ext = "mothur.lower.dist"

    def __init__(self, **kwd):
        """Initialize lower-triangle distance matrix datatype"""
        super().__init__(**kwd)

    def init_meta(self, dataset: HasMetadata, copy_from: Optional[HasMetadata] = None) -> None:
        """Delegate metadata initialisation to the parent datatype."""
        super().init_meta(dataset, copy_from=copy_from)

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format

        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances from all preceding sequences

            5  # possibly but not always preceded by a tab :/
            U68589
            U68590  0.3371
            U68591  0.3609  0.3782
            U68592  0.4155  0.3197  0.4148
            U68593  0.2872  0.1690  0.3361  0.2842

        Returns False, instead of raising, when no data line is present or the
        declared sequence count is not a positive integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        sequence_count = None
        line_num = 0
        for line in iter_headers(file_prefix, sep="\t", count=numlines):
            if line[0].startswith("@"):
                continue
            if line_num == 0:
                # first line should contain the number of sequences in the file
                if len(line) > 2:
                    return False
                try:
                    sequence_count = int("".join(line))
                except ValueError:
                    return False
                if sequence_count <= 0:
                    return False
            else:
                # number of fields should equal the line number
                if len(line) != line_num:
                    return False
                try:
                    # Distances should be floats
                    # NOTE(review): line[2:] leaves the first distance column
                    # unchecked; confirm whether line[1:] was intended.
                    for column in line[2:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            # No header line was read (empty input or only "@" lines).
            return False
        # check if the number of lines in the file was as expected
        # NOTE(review): confirm whether iter_headers can ever yield numlines + 1 lines.
        return line_num == sequence_count + 1 or line_num == numlines + 1
@build_sniff_from_prefix
class SquareDistanceMatrix(DistanceMatrix):
    file_ext = "mothur.square.dist"

    def init_meta(self, dataset: HasMetadata, copy_from: Optional[HasMetadata] = None) -> None:
        """Delegate metadata initialisation to the parent datatype."""
        super().init_meta(dataset, copy_from=copy_from)

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a square distance matrix (Column-formatted distance matrix) format

        The first line has the number of sequences in the matrix.
        The following lines have the sequence name in the first column plus a column for the distance to each sequence
        in the row order in which they appear in the matrix.

            3
            U68589  0.0000  0.3371  0.3610
            U68590  0.3371  0.0000  0.3783
            U68590  0.3371  0.0000  0.3783

        Returns False, instead of raising, when no data line is present or the
        declared sequence count is not a positive integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        sequence_count = None
        line_num = 0
        for line in iter_headers(file_prefix, sep="\t", count=numlines):
            if line[0].startswith("@"):
                continue
            if line_num == 0:
                # first line should contain the number of sequences in the file
                if len(line) > 2:
                    return False
                try:
                    sequence_count = int("".join(line))
                except ValueError:
                    return False
                if sequence_count <= 0:
                    return False
            else:
                # number of fields should equal the number of sequences
                if len(line) != sequence_count + 1:
                    return False
                try:
                    # Distances should be floats
                    # NOTE(review): line[2:] leaves the first distance column
                    # unchecked; confirm whether line[1:] was intended.
                    for column in line[2:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            # No header line was read (empty input or only "@" lines).
            return False
        # check if the number of lines in the file was as expected
        # NOTE(review): confirm whether iter_headers can ever yield numlines + 1 lines.
        return line_num == sequence_count + 1 or line_num == numlines + 1
@build_sniff_from_prefix
class PairwiseDistanceMatrix(DistanceMatrix, Tabular):
    file_ext = "mothur.pair.dist"

    def __init__(self, **kwd):
        """Initialize pairwise distance matrix datatype"""
        super().__init__(**kwd)
        self.column_names = ["Sequence", "Sequence", "Distance"]
        self.column_types = ["str", "str", "float"]

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, skip: Optional[int] = None, **kwd) -> None:
        """Delegate metadata detection to the parent datatypes."""
        super().set_meta(dataset, overwrite=overwrite, skip=skip, **kwd)

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format

        The first and second columns have the sequence names and the third column is the distance between those sequences.

        A file whose third column holds only integers is rejected (returns
        False) rather than raising.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        False
        """
        count = 0
        # Must start True: it is only ever cleared, and it is read after the loop.
        all_ints = True
        # names[c] becomes True once column c has held a non-numeric value.
        names = [False, False]
        for line in iter_headers(file_prefix, sep="\t"):
            if line[0].startswith("@"):
                continue
            if len(line) != 3:
                return False
            # check if col3 contains distances (floats)
            try:
                float(line[2])
            except ValueError:
                return False
            try:
                # See if it's also an integer
                int(line[2])
            except ValueError:
                # At least one value is not an integer
                all_ints = False
            count += 1
            # check if col1 and col2 likely contain names
            for c in (0, 1):
                try:
                    float(line[c])
                except ValueError:
                    names[c] = True
            if not names[0] or not names[1]:
                return False
        if count > 2:
            return not all_ints
        return False
class Names(Tabular):
    file_ext = "mothur.names"

    def __init__(self, **kwd):
        """
        http://www.mothur.org/wiki/Name_file
        Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2)
        """
        super().__init__(**kwd)
        headings = ["name", "representatives"]
        self.column_names = headings
        self.columns = len(headings)
class Summary(Tabular):
    file_ext = "mothur.summary"

    def __init__(self, **kwd):
        """summarizes the quality of sequences in an unaligned or aligned fasta-formatted sequence file"""
        super().__init__(**kwd)
        headings = ["seqname", "start", "end", "nbases", "ambigs", "polymer"]
        self.column_names = headings
        self.columns = len(headings)
class Group(Tabular):
    file_ext = "mothur.groups"
    MetadataElement(name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[])

    def __init__(self, **kwd):
        """
        http://www.mothur.org/wiki/Groups_file
        Group file assigns sequence (col 1) to a group (col 2)
        """
        super().__init__(**kwd)
        self.column_names = ["name", "group"]
        self.columns = 2

    def set_meta(
        self,
        dataset: DatasetProtocol,
        overwrite: bool = True,
        skip: Optional[int] = None,
        max_data_lines: Optional[int] = None,
        **kwd,
    ) -> None:
        """
        Run the parent metadata pass, then store the distinct values of the
        second column as the ``groups`` metadata.

        The group names are stored sorted so the metadata is the same on every
        run, matching how GroupAbund stores its groups.
        """
        super().set_meta(dataset, overwrite=overwrite, skip=skip, max_data_lines=max_data_lines, **kwd)
        group_names = {
            line[1] for line in iter_headers(dataset.get_file_name(), sep="\t", count=-1) if len(line) > 1
        }
        dataset.metadata.groups = sorted(group_names)
class AccNos(Tabular):
    file_ext = "mothur.accnos"

    def __init__(self, **kwd):
        """A list of names"""
        super().__init__(**kwd)
        headings = ["name"]
        self.column_names = headings
        self.columns = len(headings)
@build_sniff_from_prefix
class Oligos(Text):
    file_ext = "mothur.oligos"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        http://www.mothur.org/wiki/Oligos_File

        Determines whether the file is an oligos format

        Every line that does not start with "@" or "#" must be a two-field
        ``forward``/``reverse`` line or a three-field ``barcode`` line, and at
        least one such line must be present.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.oligos' )
        >>> Oligos().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.oligos' )
        >>> Oligos().sniff( fname )
        False
        """
        # Required number of tab-separated fields for each recognised keyword.
        field_count_for = {"forward": 2, "reverse": 2, "barcode": 3}
        recognised = 0
        for fields in iter_headers(file_prefix, sep="\t"):
            keyword = fields[0]
            if keyword.startswith("@") or keyword.startswith("#"):
                continue
            if field_count_for.get(keyword) != len(fields):
                return False
            recognised += 1
        return recognised > 0
@build_sniff_from_prefix
class Frequency(Tabular):
    file_ext = "mothur.freq"

    def __init__(self, **kwd):
        """Initialize frequency datatype (position and frequency columns)"""
        super().__init__(**kwd)
        self.column_names = ["position", "frequency"]
        self.column_types = ["int", "float"]

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a frequency tabular format for chimera analysis

        .. code-block::

            #1.14.0
            0   0.000
            1   0.000
            ...
            155 0.975

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.freq' )
        >>> Frequency().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.freq' )
        >>> Frequency().sniff( fname )
        False
        >>> # Expression count matrix (EdgeR wrapper)
        >>> fname = get_test_fname( 'mothur_datatypetest_false_2.mothur.freq' )
        >>> Frequency().sniff( fname )
        False
        """
        seen = 0
        for fields in iter_headers(file_prefix, sep="\t"):
            if fields[0].startswith("@"):
                continue
            if seen == 0:
                # first line should be #<version string>
                if len(fields) != 1 or not fields[0].startswith("#"):
                    return False
            else:
                # all other lines should be <int> <float>
                if len(fields) != 2:
                    return False
                try:
                    int(fields[0])
                    float(fields[1])
                except Exception:
                    return False
                # The frequency must be written with a decimal point.
                if "." not in fields[1]:
                    return False
            seen += 1
        return seen > 1
@build_sniff_from_prefix
class Quantile(Tabular):
    file_ext = "mothur.quan"
    # The two descriptions were attached to the wrong elements; each now
    # matches its own element name.
    MetadataElement(
        name="filtered",
        default=False,
        no_value=False,
        optional=True,
        desc="Quantiles calculated using a frequency filter",
        readonly=True,
    )
    MetadataElement(
        name="masked",
        default=False,
        no_value=False,
        optional=True,
        desc="Quantiles calculated using a mask",
        readonly=True,
    )

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        super().__init__(**kwd)
        self.column_names = ["num", "ten", "twentyfive", "fifty", "seventyfive", "ninetyfive", "ninetynine"]
        self.column_types = ["int", "float", "float", "float", "float", "float", "float"]

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a quantiles tabular format for chimera analysis

        .. code-block::

            1   0   0   0   0   0   0
            2   0.309198    0.309198    0.37161 0.37161 0.37161 0.37161
            3   0.510982    0.563213    0.693529    0.858939    1.07442 1.20608
            ...

        Every line that does not start with "@" or "#" must have exactly seven
        fields: an integer followed by six floats. At least one such line is
        required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.quan' )
        >>> Quantile().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.quan' )
        >>> Quantile().sniff( fname )
        False
        """
        count = 0
        for line in iter_headers(file_prefix, sep="\t"):
            if line[0].startswith("@") or line[0].startswith("#"):
                continue
            if len(line) != 7:
                return False
            try:
                int(line[0])
                for value in line[1:]:
                    float(value)
            except ValueError:
                return False
            count += 1
        return count > 0
@build_sniff_from_prefix
class LaneMask(Text):
    file_ext = "mothur.filter"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a lane mask filter: 1 line consisting of zeros and ones.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.filter' )
        >>> LaneMask().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.filter' )
        >>> LaneMask().sniff( fname )
        False
        """
        rows = get_headers(file_prefix, sep="\t", count=2)
        # Exactly one line holding exactly one field is expected.
        if len(rows) != 1 or len(rows[0]) != 1:
            return False
        mask = rows[0][0]
        # these filter files should be relatively big
        if len(mask) < 1000:
            return False
        return re.match("^[01]+$", mask) is not None
class CountTable(Tabular):
    MetadataElement(name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[])
    file_ext = "mothur.count_table"

    def __init__(self, **kwd):
        """
        http://www.mothur.org/wiki/Count_File
        A table with first column names and following columns integer counts

        # Example 1:
        Representative_Sequence total
        U68630  1
        U68595  1
        U68600  1

        # Example 2 (with group columns):
        Representative_Sequence total   forest  pasture
        U68630  1   1   0
        U68595  1   1   0
        U68600  1   1   0
        U68591  1   1   0
        U68647  1   0   1
        """
        super().__init__(**kwd)
        self.column_names = ["name", "total"]

    def set_meta(
        self,
        dataset: DatasetProtocol,
        overwrite: bool = True,
        skip: Optional[int] = 1,
        max_data_lines: Optional[int] = None,
        **kwd,
    ) -> None:
        """
        Run the parent metadata pass, then derive column types, column count
        and group names from the first line of the file.

        The first line counts as one comment line and is subtracted from an
        integer ``data_lines``. If the file has no lines at all, only the parent
        metadata is set.
        """
        super().set_meta(dataset, overwrite=overwrite, **kwd)

        headers = get_headers(dataset.get_file_name(), sep="\t", count=1)
        if not headers:
            # Empty dataset: there is no heading row to inspect.
            return
        colnames = headers[0]
        dataset.metadata.column_types = ["str"] + (["int"] * (len(colnames) - 1))
        if len(colnames) > 1:
            dataset.metadata.columns = len(colnames)
        if len(colnames) > 2:
            # Everything after the name and total columns is a group column.
            dataset.metadata.groups = colnames[2:]

        dataset.metadata.comment_lines = 1
        if isinstance(dataset.metadata.data_lines, int):
            dataset.metadata.data_lines -= 1
@build_sniff_from_prefix
class RefTaxonomy(Tabular):
    file_ext = "mothur.ref.taxonomy"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a Reference Taxonomy

        http://www.mothur.org/wiki/Taxonomy_outline

        A table with 2 or 3 columns:

        - SequenceName
        - Taxonomy (semicolon-separated taxonomy in descending order)
        - integer ?

        Example: 2-column (http://www.mothur.org/wiki/Taxonomy_outline)

        .. code-block::

            X56533.1        Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
            X97975.1        Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
            AF052717.1      Eukaryota;Parabasalidea;

        Example: 3-column (http://vamps.mbl.edu/resources/databases.php)

        .. code-block::

            v3_AA008        Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus      5
            v3_AA016        Bacteria        120
            v3_AA019        Archaea;Crenarchaeota;Marine_Group_I    1

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        False
        """
        taxonomy_re = re.compile("^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$")
        entries = 0
        saw_semicolon = False
        for fields in iter_headers(file_prefix, sep="\t", count=300):
            if fields[0].startswith("@") or fields[0].startswith("#"):
                continue
            if len(fields) < 2 or len(fields) > 3:
                return False
            taxonomy = fields[1]
            if not taxonomy_re.match(taxonomy):
                return False
            if ";" in taxonomy:
                saw_semicolon = True
            if len(fields) == 3:
                # The optional third column must be an integer.
                try:
                    int(fields[2])
                except Exception:
                    return False
            entries += 1
        # Require that at least one entry has semicolons in the 2nd column
        return entries > 0 and saw_semicolon
class ConsensusTaxonomy(Tabular):
    file_ext = "mothur.cons.taxonomy"

    def __init__(self, **kwd):
        """Initialize consensus taxonomy datatype (OTU, count and taxonomy columns)"""
        super().__init__(**kwd)
        headings = ["OTU", "count", "taxonomy"]
        self.column_names = headings
class TaxonomySummary(Tabular):
    file_ext = "mothur.tax.summary"

    def __init__(self, **kwd):
        """A Summary of taxon classification"""
        super().__init__(**kwd)
        headings = ["taxlevel", "rankID", "taxon", "daughterlevels", "total"]
        self.column_names = headings
@build_sniff_from_prefix
class Axes(Tabular):
    file_ext = "mothur.axes"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is an axes format

        The first line may have column headings.
        The following lines have the name in the first column plus float columns for each axis.

        .. code-block::

            group   axis1   axis2
            forest  0.000000        0.145743
            pasture 0.145743        0.000000

        .. code-block::

                    axis1   axis2
            U68589  0.262608        -0.077498
            U68590  0.027118        0.195197
            U68591  0.329854        0.014395

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.axes' )
        >>> Axes().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.axes' )
        >>> Axes().sniff( fname )
        False
        """
        rows_seen = 0
        width = None
        only_integers = True
        for fields in iter_headers(file_prefix, sep="\t"):
            rows_seen += 1
            if rows_seen == 1:
                # The first line is never validated; it may hold headings.
                continue
            if width is None:
                # The first validated row fixes the expected column count.
                width = len(fields)
                if width < 2:
                    return False
            elif len(fields) != width:
                return False
            for cell in fields[1:width]:
                try:
                    value = float(cell)
                except ValueError:
                    return False
                # Check abs value is <= 1.0
                if abs(value) > 1.0:
                    return False
                # Also test for whether value is an integer
                try:
                    int(cell)
                except ValueError:
                    only_integers = False
        return rows_seen > 0 and not only_integers
class SffFlow(Tabular):
    """
    https://mothur.org/wiki/flow_file/

    The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400.
    Following lines contain:

    - SequenceName
    - the number of useable flows as defined by 454's software
    - the flow intensity for each base going in the order of TACG.

    Example:

    .. code-block::

        800
        GQY1XT001CQL4K 85 1.04 0.00 1.00 0.02 0.03 1.02 0.05 ...
        GQY1XT001CQIRF 84 1.02 0.06 0.98 0.06 0.09 1.05 0.07 ...
        GQY1XT001CF5YW 88 1.02 0.02 1.01 0.04 0.06 1.02 0.03 ...
    """

    file_ext = "mothur.sff.flow"
    MetadataElement(
        name="flow_values", default="", no_value="", optional=True, desc="Total number of flow values", readonly=True
    )
    MetadataElement(name="flow_order", default="TACG", no_value="TACG", desc="Order of the flowed bases", readonly=False)

    def set_meta(
        self,
        dataset: DatasetProtocol,
        overwrite: bool = True,
        skip: Optional[int] = 1,
        max_data_lines: Optional[int] = None,
        **kwd,
    ) -> None:
        """
        Run the parent metadata pass, then store the integer on the first line
        as ``flow_values``.

        A first line that is missing or not an integer is logged as a warning
        and leaves ``flow_values`` untouched.
        """
        # The parent is always told to skip one line, whatever `skip` was passed.
        super().set_meta(dataset, overwrite=overwrite, skip=1, max_data_lines=max_data_lines, **kwd)

        headers = get_headers(dataset.get_file_name(), sep="\t", count=1)
        try:
            flow_values = int(headers[0][0])
            dataset.metadata.flow_values = flow_values
        except Exception as e:
            log.warning(f"SffFlow set_meta {e}")

    def make_html_table(self, dataset: DatasetProtocol, skipchars: Optional[List] = None, **kwargs) -> str:
        """
        Create HTML table, used for displaying peek

        Column headers are "Name", "Flows" and then one numbered header per flow
        column, labelled by cycling through ``metadata.flow_order``. Any failure
        is reported as a "Can't create peek" message instead of being raised.
        """
        skipchars = skipchars or []
        try:
            flow_order = dataset.metadata.flow_order
            parts = ['<table cellspacing="0" cellpadding="3">']
            # Generate column header
            parts.append("<tr>")
            parts.append("<th>1. Name</th>")
            parts.append("<th>2. Flows</th>")
            for i in range(3, dataset.metadata.columns + 1):
                # Cycle through the configured order; for a four-base order this
                # picks the same base as the previous (i + 1) % 4 indexing.
                base = flow_order[(i - 3) % len(flow_order)]
                parts.append("<th>%d. %s</th>" % (i - 2, base))
            parts.append("</tr>")
            parts.append(self.make_html_peek_rows(dataset, skipchars=skipchars))
            parts.append("</table>")
            out = "".join(parts)
        except Exception as exc:
            out = f"Can't create peek: {unicodify(exc)}"
        return out