Source code for galaxy.visualization.data_providers.phyloviz.newickparser

import re

from .baseparser import Base_Parser, PhyloTree


[docs]class Newick_Parser(Base_Parser): """For parsing trees stored in the newick format (.nhx) It is necessarily more complex because this parser is later extended by Nexus for parsing newick as well.."""
[docs] def __init__(self): super().__init__()
[docs] def parseFile(self, filePath): """Parses a newick file to obtain the string inside. Returns: jsonableDict""" with open(filePath) as newickFile: newickString = newickFile.read() newickString = newickString.replace("\n", "").replace("\r", "") return [self.parseData(newickString)], "Success"
[docs] def parseData(self, newickString): """To be called on a newickString directly to parse it. Returns: jsonableDict""" return self._parseNewickToJson(newickString)
def _parseNewickToJson(self, newickString, treeName=None, nameMap=None): """parses a newick representation of a tree into a PhyloTree data structure, which can be easily converted to json""" self.phyloTree = PhyloTree() newickString = self.cleanNewickString(newickString) if nameMap: newickString = self._mapName(newickString, nameMap) self.phyloTree.root = self.parseNode(newickString, 0) if nameMap: self.phyloTree.addAttributesToRoot({"treeName": treeName}) return self.phyloTree.generateJsonableDict()
[docs] def cleanNewickString(self, rawNewick): r"""removing semi colon, and illegal json characters (\,',") and white spaces""" return re.sub(r'\s|;|\"|\'|\\', r'', rawNewick)
def _makeNodesFromString(self, string, depth): """elements separated by comma could be empty""" if string.find("(") != -1: raise Exception(f"Tree is not well form, location: {string}") childrenString = string.split(",") childrenNodes = [] for childString in childrenString: if len(childString) == 0: continue nodeInfo = childString.split(":") name, length, bootstrap = "", None, -1 if len(nodeInfo) == 2: # has length info length = nodeInfo[1] # checking for bootstap values name = nodeInfo[0] try: # Nexus may bootstrap in names position name = float(name) if 0 <= name <= 1: bootstrap = name elif 1 <= name <= 100: bootstrap = name / 100 name = "" except ValueError: name = nodeInfo[0] else: name = nodeInfo[0] # string only contains name node = self.phyloTree.makeNode(name, length=length, depth=depth, bootstrap=bootstrap) childrenNodes += [node] return childrenNodes def _mapName(self, newickString, nameMap): """ Necessary to replace names of terms inside nexus representation Also, it's here because Mailaud's doesnt deal with id_strings outside of quotes(" ") """ newString = "" start = 0 end = 0 for i in range(len(newickString)): if newickString[i] == "(" or newickString[i] == ",": if re.match(r"[,(]", newickString[i + 1:]): continue else: end = i + 1 # i now refers to the starting position of the term to be replaced, # we will next find j which is the ending pos of the term for j in range(i + 1, len(newickString)): enclosingSymbol = newickString[j] # the immediate symbol after a common or left bracket which denotes the end of a term if enclosingSymbol == ")" or enclosingSymbol == ":" or enclosingSymbol == ",": termToReplace = newickString[end:j] newString += newickString[start:end] + nameMap[termToReplace] # + "'" "'" + start = j break newString += newickString[start:] return newString
[docs] def parseNode(self, string, depth): """ Recursive method for parsing newick string, works by stripping down the string into substring of newick contained with brackers, which is used to call itself. Eg ... ( A, B, (D, E)C, F, G ) ... We will make the preceeding nodes first A, B, then the internal node C, its children D, E, and finally the succeeding nodes F, G """ # Base case where there is only an empty string if string == "": return # Base case there it's only an internal claude if string.find("(") == -1: return self._makeNodesFromString(string, depth) nodes = [] # nodes refer to the nodes on this level start = 0 lenOfPreceedingInternalNodeString = 0 bracketStack = [] for j in range(len(string)): if string[j] == "(": # finding the positions of all the open brackets bracketStack.append(j) continue if string[j] == ")": # finding the positions of all the closed brackets to extract claude i = bracketStack.pop() if len(bracketStack) == 0: # is child of current node InternalNode = None # First flat call to make nodes of the same depth but from the preceeding string. startSubstring = string[start + lenOfPreceedingInternalNodeString: i] preceedingNodes = self._makeNodesFromString(startSubstring, depth) nodes += preceedingNodes # Then We will try to see if the substring has any internal nodes first, make it then make nodes preceeding it and succeeding it. if j + 1 < len(string): stringRightOfBracket = string[j + 1:] # Eg. '(b:0.4,a:0.3)c:0.3, stringRightOfBracket = c:0.3 match = re.search(r"[\)\,\(]", stringRightOfBracket) if match: indexOfNextSymbol = match.start() stringRepOfInternalNode = stringRightOfBracket[:indexOfNextSymbol] internalNodes = self._makeNodesFromString(stringRepOfInternalNode, depth) if len(internalNodes) > 0: InternalNode = internalNodes[0] lenOfPreceedingInternalNodeString = len(stringRepOfInternalNode) else: # sometimes the node can be the last element of a string InternalNode = self._makeNodesFromString(string[j + 1:], depth)[0] lenOfPreceedingInternalNodeString = len(string) - j if InternalNode is None: # creating a generic node if it is unnamed InternalNode = self.phyloTree.makeNode("", depth=depth, isInternal=True) # "internal-" + str(depth) lenOfPreceedingInternalNodeString = 0 # recussive call to make the internal claude childSubString = string[i + 1:j] InternalNode.addChildNode(self.parseNode(childSubString, depth + 1)) nodes.append(InternalNode) # we append the internal node later to preserve order start = j + 1 continue if depth == 0: # if it's the root node, we do nothing about it and return return nodes[0] # Adding last most set of children endString = string[start:] if string[start - 1] == ")": # if the symbol belongs to an internal node which is created previously, then we remove it from the string left to parse match = re.search(r"[\)\,\(]", endString) if match: endOfNodeName = start + match.start() + 1 endString = string[endOfNodeName:] nodes += self._makeNodesFromString(endString, depth) return nodes