Warning
This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.
Source code for galaxy.visualization.data_providers.phyloviz.newickparser
import re
from .baseparser import Base_Parser, PhyloTree
[docs]class Newick_Parser(Base_Parser):
    """For parsing trees stored in the newick format (.nhx)
    It is necessarily more complex because this parser is later extended by Nexus for parsing newick as well.."""
[docs]    def parseFile(self, filePath):
        """Parses a newick file to obtain the string inside. Returns: jsonableDict"""
        with open(filePath, "r") as newickFile:
            newickString = newickFile.read()
            newickString = newickString.replace("\n", "").replace("\r", "")
            return [self.parseData(newickString)], "Success"
[docs]    def parseData(self, newickString):
        """To be called on a newickString directly to parse it. Returns: jsonableDict"""
        return self._parseNewickToJson(newickString)
    def _parseNewickToJson(self, newickString, treeName=None, nameMap=None):
        """parses a newick representation of a tree into a PhyloTree data structure,
        which can be easily converted to json"""
        self.phyloTree = PhyloTree()
        newickString = self.cleanNewickString(newickString)
        if nameMap:
            newickString = self._mapName(newickString, nameMap)
        self.phyloTree.root = self.parseNode(newickString, 0)
        if nameMap:
            self.phyloTree.addAttributesToRoot({"treeName": treeName})
        return self.phyloTree.generateJsonableDict()
[docs]    def cleanNewickString(self, rawNewick):
        """removing semi colon, and illegal json characters (\,',") and white spaces"""
        return re.sub(r'\s|;|\"|\'|\\', '', rawNewick)
    def _makeNodesFromString(self, string, depth):
        """elements separated by comma could be empty"""
        if string.find("(") != -1:
            raise Exception("Tree is not well form, location: " + string)
        childrenString = string.split(",")
        childrenNodes = []
        for childString in childrenString:
            if len(childString) == 0:
                continue
            nodeInfo = childString.split(":")
            name, length, bootstrap = "", None, -1
            if len(nodeInfo) == 2:  # has length info
                length = nodeInfo[1]
                # checking for bootstap values
                name = nodeInfo[0]
                try:    # Nexus may bootstrap in names position
                    name = float(name)
                    if 0 <= name <= 1:
                        bootstrap = name
                    elif 1 <= name <= 100:
                        bootstrap = name / 100
                    name = ""
                except ValueError:
                    name = nodeInfo[0]
            else:
                name = nodeInfo[0]      # string only contains name
            node = self.phyloTree.makeNode(name, length=length, depth=depth, bootstrap=bootstrap)
            childrenNodes += [node]
        return childrenNodes
    def _mapName(self, newickString, nameMap):
        """
        Necessary to replace names of terms inside nexus representation
        Also, it's here because Mailaud's doesnt deal with id_strings outside of quotes(" ")
        """
        newString = ""
        start = 0
        end = 0
        for i in range(len(newickString)):
            if newickString[i] == "(" or newickString[i] == ",":
                if re.match(r"[,(]", newickString[i + 1:]):
                    continue
                else:
                    end = i + 1
                    # i now refers to the starting position of the term to be replaced,
                    # we will next find j which is the ending pos of the term
                    for j in range(i + 1, len(newickString)):
                        enclosingSymbol = newickString[j]   # the immediate symbol after a common or left bracket which denotes the end of a term
                        if enclosingSymbol == ")" or enclosingSymbol == ":" or enclosingSymbol == ",":
                            termToReplace = newickString[end:j]
                            newString += newickString[start : end] + nameMap[termToReplace]  # + "'"  "'" +
                            start = j
                            break
        newString += newickString[start:]
        return newString
[docs]    def parseNode(self, string, depth):
        """
        Recursive method for parsing newick string, works by stripping down the string into substring
        of newick contained with brackers, which is used to call itself.
        Eg ... ( A, B, (D, E)C, F, G ) ...
        We will make the preceeding nodes first A, B, then the internal node C, its children D, E,
        and finally the succeeding nodes F, G
        """
        # Base case where there is only an empty string
        if string == "":
            return
            # Base case there it's only an internal claude
        if string.find("(") == -1:
            return self._makeNodesFromString(string, depth)
        nodes = []      # nodes refer to the nodes on this level
        start = 0
        lenOfPreceedingInternalNodeString = 0
        bracketStack = []
        for j in range(len(string)):
            if string[j] == "(":    # finding the positions of all the open brackets
                bracketStack.append(j)
                continue
            if string[j] == ")":    # finding the positions of all the closed brackets to extract claude
                i = bracketStack.pop()
                if len(bracketStack) == 0:  # is child of current node
                    InternalNode = None
                    # First flat call to make nodes of the same depth but from the preceeding string.
                    startSubstring = string[start + lenOfPreceedingInternalNodeString: i]
                    preceedingNodes = self._makeNodesFromString(startSubstring, depth)
                    nodes += preceedingNodes
                    # Then We will try to see if the substring has any internal nodes first, make it then make nodes preceeding it and succeeding it.
                    if j + 1 < len(string):
                        stringRightOfBracket = string[j + 1:]      # Eg. '(b:0.4,a:0.3)c:0.3, stringRightOfBracket = c:0.3
                        match = re.search(r"[\)\,\(]", stringRightOfBracket)
                        if match:
                            indexOfNextSymbol = match.start()
                            stringRepOfInternalNode = stringRightOfBracket[:indexOfNextSymbol]
                            internalNodes = self._makeNodesFromString(stringRepOfInternalNode, depth)
                            if len(internalNodes) > 0:
                                InternalNode = internalNodes[0]
                            lenOfPreceedingInternalNodeString = len(stringRepOfInternalNode)
                        else:   # sometimes the node can be the last element of a string
                            InternalNode = self._makeNodesFromString(string[j + 1:], depth)[0]
                            lenOfPreceedingInternalNodeString = len(string) - j
                    if InternalNode is None:       # creating a generic node if it is unnamed
                        InternalNode = self.phyloTree.makeNode("", depth=depth, isInternal=True)  # "internal-" + str(depth)
                        lenOfPreceedingInternalNodeString = 0
                    # recussive call to make the internal claude
                    childSubString = string[i + 1 : j]
                    InternalNode.addChildNode(self.parseNode(childSubString, depth + 1))
                    nodes.append(InternalNode)  # we append the internal node later to preserve order
                    start = j + 1
                continue
        if depth == 0:    # if it's the root node, we do nothing about it and return
            return nodes[0]
        # Adding last most set of children
        endString = string[start:]
        if string[start - 1] == ")":  # if the symbol belongs to an internal node which is created previously, then we remove it from the string left to parse
            match = re.search(r"[\)\,\(]", endString)
            if match:
                endOfNodeName = start + match.start() + 1
                endString = string[endOfNodeName:]
                nodes += self._makeNodesFromString(endString, depth)
        return nodes