Warning

This document is for an in-development version of Galaxy. Alternatively, you can view this page in the latest release (if it exists there) or go to the top of the latest release's documentation.

Source code for galaxy.tools.search

"""
Module for building and searching the index of tools
installed within this Galaxy.
"""
import logging
import re
import tempfile

from bleach import clean
from whoosh import analysis
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import (
    KEYWORD,
    Schema,
    STORED,
    TEXT
)
from whoosh.filedb.filestore import (
    FileStorage,
    RamStorage
)
from whoosh.qparser import MultifieldParser
from whoosh.scoring import BM25F

from galaxy.util import ExecutionTimer
from galaxy.web.framework.helpers import to_unicode

log = logging.getLogger(__name__)


class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses
    the Whoosh search library.
    """

    def __init__(self, toolbox, index_help=True):
        """
        Create a searcher for `toolbox`.

        :param toolbox: the toolbox this searcher is attached to.
        :param index_help: accepted for backward compatibility; it is not
            read here. Whether help text is indexed is decided by the
            ``index_help`` argument of :meth:`build_index`.
        """
        self.schema = Schema(id=STORED,
                             stub=KEYWORD,
                             name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                             description=TEXT,
                             section=TEXT,
                             help=TEXT,
                             labels=KEYWORD)
        self.rex = analysis.RegexTokenizer()
        self.toolbox = toolbox
        self.storage, self.index = self._index_setup()
        # We keep track of how many times the tool index has been rebuilt.
        # We start at -1, so that after the first index the count is at 0,
        # which is the same as the toolbox reload count. This way we can skip
        # reindexing if the index count is equal to the toolbox reload count.
        self.index_count = -1

    def _index_setup(self):
        """
        Create the in-memory storage and an empty index using ``self.schema``.

        :returns: a ``(storage, index)`` tuple.
        """
        # Works around https://bitbucket.org/mchaput/whoosh/issues/391/race-conditions-with-temp-storage
        RamStorage.temp_storage = _temp_storage
        storage = RamStorage()
        index = storage.create_index(self.schema)
        return storage, index

    def build_index(self, tool_cache, index_help=True):
        """
        Prepare search index for tools loaded in toolbox.

        Use `tool_cache` to determine which tools need indexing and which
        tools should be expired.

        :param tool_cache: object providing ``_removed_tool_ids``,
            ``_new_tool_ids`` and ``get_tool_by_id``.
        :param index_help: when true, rendered tool help is indexed as well.
        """
        log.debug('Starting to build toolbox index.')
        self.index_count += 1
        execution_timer = ExecutionTimer()
        # Using the writer as a context manager commits on success and
        # cancels on error, so a failure while indexing a single tool does
        # not leave an open, uncommitted writer behind.
        with self.index.writer() as writer:
            # NOTE(review): `id` is declared STORED in the schema (stored, not
            # indexed) - confirm that delete_by_term can actually match on it.
            for tool_id in tool_cache._removed_tool_ids:
                writer.delete_by_term('id', tool_id)
            for tool_id in tool_cache._new_tool_ids:
                tool = tool_cache.get_tool_by_id(tool_id)
                if not tool:
                    continue
                add_doc_kwds = self._create_doc(tool_id=tool_id, tool=tool, index_help=index_help)
                # _create_doc returns an empty dict for tools that must not be
                # indexed (data managers); do not add empty documents.
                if add_doc_kwds:
                    writer.add_document(**add_doc_kwds)
        log.debug("Toolbox index finished %s", execution_timer)

    def _join_tokens(self, text):
        """
        Tokenize `text` with ``self.rex`` and re-join the tokens with spaces.

        Used to get rid of hyphens, which are wildcards in Whoosh and cause
        bad things.
        """
        return ' '.join([token.text for token in self.rex(to_unicode(text))])

    def _create_doc(self, tool_id, tool, index_help=True):
        """
        Build the keyword arguments for ``writer.add_document`` for one tool.

        :param tool_id: id under which the tool is stored in the index.
        :param tool: the tool to describe.
        :param index_help: when true and the tool has help, the rendered help
            (with markup stripped) is included.
        :returns: a dict of index fields, or an empty dict if the tool must
            not be indexed.
        """
        # Do not add data managers to the public index
        if tool.tool_type == 'manage_data':
            return {}
        panel_section = tool.get_panel_section()
        add_doc_kwds = {
            "id": tool_id,
            "description": to_unicode(tool.description),
            "section": to_unicode(panel_section[1] if len(panel_section) == 2 else ''),
            "help": to_unicode("")
        }
        if '-' in tool.name:
            # Hyphens are wildcards in Whoosh causing bad things
            add_doc_kwds['name'] = self._join_tokens(tool.name)
        else:
            add_doc_kwds['name'] = to_unicode(tool.name)
        if tool.guid:
            # Create a stub consisting of owner, repo, and tool from guid:
            # the text between the 2nd and the 5th slash.
            slash_indexes = [m.start() for m in re.finditer('/', tool.guid)]
            id_stub = tool.guid[(slash_indexes[1] + 1): slash_indexes[4]]
            add_doc_kwds['stub'] = self._join_tokens(id_stub)
        else:
            # Fall back to the tool id (previously the builtin `id` function
            # was stringified here by mistake).
            add_doc_kwds['stub'] = to_unicode(tool_id)
        if tool.labels:
            add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
        if index_help and tool.help:
            try:
                raw_html = tool.help.render(host_url="", static_path="")
                cleantext = clean(raw_html, tags=[''], strip=True).replace('\n', ' ')
                add_doc_kwds['help'] = to_unicode(cleantext)
            except Exception:
                # Don't fail to build index just because a help message
                # won't render; the tool is indexed with empty help instead.
                log.debug("Could not render help of tool '%s' for the search index.", tool_id, exc_info=True)
        return add_doc_kwds

    def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit, tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize):
        """
        Perform search on the in-memory index. Weight in the given boosts.

        :param q: the query string.
        :param tool_search_limit: maximum number of hits requested per query
            and, for ngram search, maximum number of ids returned.
        :param tool_enable_ngram_search: ngram search is used when this is
            ``True`` or the string ``"True"``.
        :param tool_ngram_minsize: minimum ngram size (ngram search only).
        :param tool_ngram_maxsize: maximum ngram size (ngram search only).
        :returns: a list of tool ids, best match first.
        """
        # Change field boosts for searcher
        # NOTE(review): Whoosh's BM25F reads per-field values from
        # `<fieldname>_B=` keyword arguments - confirm that wrapping them in a
        # single `field_B` dict has the intended effect.
        weighting = BM25F(
            field_B={'name_B': float(tool_name_boost),
                     'section_B': float(tool_section_boost),
                     'description_B': float(tool_description_boost),
                     'labels_B': float(tool_label_boost),
                     'stub_B': float(tool_stub_boost),
                     'help_B': float(tool_help_boost)}
        )
        # Set query to search name, description, section, help, labels and stub.
        parser = MultifieldParser(['name', 'description', 'section', 'help', 'labels', 'stub'], schema=self.schema)
        # Hyphens are wildcards in Whoosh causing bad things
        if '-' in q:
            q = self._join_tokens(q)
        # The searcher is closed when the block is left; all stored fields
        # are read from the hits before that happens.
        with self.index.searcher(weighting=weighting) as searcher:
            # Perform tool search with ngrams if set to true in the config file
            if tool_enable_ngram_search is True or tool_enable_ngram_search == "True":
                hits_with_score = {}
                token_analyzer = StandardAnalyzer() | analysis.NgramFilter(minsize=int(tool_ngram_minsize), maxsize=int(tool_ngram_maxsize))
                ngrams = [token.text for token in token_analyzer(q)]
                for query in ngrams:
                    # Get the tool list with respective scores for each qgram
                    curr_hits = searcher.search(parser.parse('*' + query + '*'), limit=float(tool_search_limit))
                    for i, curr_hit in enumerate(curr_hits):
                        hit_id = curr_hit['id']
                        score = curr_hits.score(i)
                        if hit_id in hits_with_score:
                            # The tool appeared for an earlier qgram as well:
                            # add the current score to the previous one.
                            hits_with_score[hit_id] = score + hits_with_score[hit_id]
                        else:
                            hits_with_score[hit_id] = score
                # Sort the results based on aggregated BM25 score in
                # decreasing order of scores
                ranked_hits = sorted(hits_with_score.items(), key=lambda x: x[1], reverse=True)
                # Return the tool ids
                return [item[0] for item in ranked_hits[0:int(tool_search_limit)]]
            else:
                # Perform the search
                hits = searcher.search(parser.parse('*' + q + '*'), limit=float(tool_search_limit))
                return [hit['id'] for hit in hits]
def _temp_storage(self, name=None):
    """
    Replacement for ``RamStorage.temp_storage``.

    Makes a new temporary directory with ``tempfile.mkdtemp`` and returns the
    result of calling ``create()`` on a ``FileStorage`` rooted at that
    directory. The ``name`` argument is accepted but not used.
    """
    temp_dir = tempfile.mkdtemp()
    return FileStorage(temp_dir).create()