Source code for galaxy.tools.search

"""
Module for building and searching the index of tools
installed within this Galaxy. Before changing index-building
or searching-related parts, it is strongly recommended to read
through the library docs at https://whoosh.readthedocs.io.
"""
import logging
import os
import re

from whoosh import (
    analysis,
    index,
)
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import (
    ID,
    KEYWORD,
    Schema,
    TEXT
)
from whoosh.qparser import (
    MultifieldParser,
    OrGroup,
)
from whoosh.scoring import BM25F
from whoosh.writing import AsyncWriter

from galaxy.util import ExecutionTimer
from galaxy.web.framework.helpers import to_unicode

log = logging.getLogger(__name__)


def get_or_create_index(index_dir, schema):
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    if index.exists_in(index_dir):
        idx = index.open_dir(index_dir)
        try:
            assert idx.schema == schema
            return idx
        except AssertionError:
            log.warning("Index at '%s' uses outdated schema, creating new index", index_dir)
    return index.create_in(index_dir, schema=schema)
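
As a quick illustration, get_or_create_index can be exercised on its own. This is a minimal sketch; the directory name and the two-field schema are made up and are not part of Galaxy:

# Hypothetical usage sketch; 'scratch_index' and the toy schema are
# invented for illustration only.
demo_schema = Schema(id=ID(stored=True, unique=True), name=TEXT)
idx = get_or_create_index('scratch_index', demo_schema)
# A second call with the same schema reopens the existing index; a call
# with a different schema logs a warning and creates a fresh index.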

class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses
    the Whoosh search library.
    """

    def __init__(self, toolbox, index_dir=None, index_help=True):
        self.schema = Schema(id=ID(stored=True, unique=True),
                             stub=KEYWORD,
                             name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                             description=TEXT,
                             section=TEXT,
                             help=TEXT,
                             labels=KEYWORD)
        self.rex = analysis.RegexTokenizer()
        self.index_dir = index_dir
        self.toolbox = toolbox
        self.index = self._index_setup()
        # We keep track of how many times the tool index has been rebuilt.
        # We start at -1, so that after the first index the count is at 0,
        # which is the same as the toolbox reload count. This way we can skip
        # reindexing if the index count is equal to the toolbox reload count.
        self.index_count = -1

    def _index_setup(self):
        return get_or_create_index(index_dir=self.index_dir, schema=self.schema)

    def build_index(self, tool_cache, index_help=True):
        """
        Prepare search index for tools loaded in toolbox.
        Use `tool_cache` to determine which tools need indexing
        and which tools should be expired.
        """
        log.debug('Starting to build toolbox index.')
        self.index_count += 1
        execution_timer = ExecutionTimer()
        with self.index.reader() as reader:
            # Index occasionally contains empty stored fields
            indexed_tool_ids = {f['id'] for f in reader.all_stored_fields() if f}
        tool_ids_to_remove = (indexed_tool_ids - set(tool_cache._tool_paths_by_id.keys())).union(tool_cache._removed_tool_ids)
        with AsyncWriter(self.index) as writer:
            for tool_id in tool_ids_to_remove:
                writer.delete_by_term('id', tool_id)
            for tool_id in tool_cache._new_tool_ids - indexed_tool_ids:
                tool = tool_cache.get_tool_by_id(tool_id)
                if tool and tool.is_latest_version:
                    if tool.hidden:
                        # Check if there is an older, visible tool we can index instead
                        if tool.lineage:
                            for tool_version in reversed(tool.lineage.get_versions()):
                                tool = tool_cache.get_tool_by_id(tool_version.id)
                                if tool and not tool.hidden:
                                    tool_id = tool.id
                                    break
                            else:
                                continue
                        else:
                            continue
                    add_doc_kwds = self._create_doc(tool_id=tool_id, tool=tool, index_help=index_help)
                    writer.update_document(**add_doc_kwds)
        log.debug("Toolbox index finished %s", execution_timer)
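
The removal logic above is plain set arithmetic over tool ids. A standalone sketch with made-up ids shows which documents would be deleted:

# Made-up ids illustrating the removal set computed by build_index:
indexed_tool_ids = {'cat1', 'sort1', 'stale_tool'}   # ids currently stored in the Whoosh index
cached_tool_ids = {'cat1', 'sort1', 'fresh_tool'}    # keys of tool_cache._tool_paths_by_id
removed_tool_ids = {'stale_tool'}                    # tool_cache._removed_tool_ids

tool_ids_to_remove = (indexed_tool_ids - cached_tool_ids).union(removed_tool_ids)
print(tool_ids_to_remove)  # {'stale_tool'}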

    def _create_doc(self, tool_id, tool, index_help=True):
        # Do not add data managers to the public index
        if tool.tool_type == 'manage_data':
            return {}
        add_doc_kwds = {
            "id": tool_id,
            "description": to_unicode(tool.description),
            "section": to_unicode(tool.get_panel_section()[1] if len(tool.get_panel_section()) == 2 else ''),
            "help": to_unicode("")
        }
        if tool.name.find('-') != -1:
            # Replace hyphens, since they are wildcards in Whoosh causing false positives
            add_doc_kwds['name'] = (' ').join(token.text for token in self.rex(to_unicode(tool.name)))
        else:
            add_doc_kwds['name'] = to_unicode(tool.name)
        if tool.guid:
            # Create a stub consisting of owner, repo, and tool from guid
            slash_indexes = [m.start() for m in re.finditer('/', tool.guid)]
            id_stub = tool.guid[(slash_indexes[1] + 1): slash_indexes[4]]
            add_doc_kwds['stub'] = (' ').join(token.text for token in self.rex(to_unicode(id_stub)))
        else:
            add_doc_kwds['stub'] = to_unicode(tool_id)
        if tool.labels:
            add_doc_kwds['labels'] = to_unicode(" ".join(tool.labels))
        if index_help:
            raw_help = tool.raw_help
            if raw_help:
                try:
                    add_doc_kwds['help'] = to_unicode(raw_help)
                except Exception:
                    # Don't fail to build index just because help can't be converted.
                    pass
        return add_doc_kwds
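
For the guid-handling branch above, a worked example with a made-up Tool Shed guid shows what the slice between the second and fifth slash keeps, namely owner, repository, and tool id:

# Hypothetical guid in the usual Tool Shed layout:
#   <shed host>/repos/<owner>/<repository>/<tool id>/<version>
guid = 'toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.3.4'
slash_indexes = [m.start() for m in re.finditer('/', guid)]
id_stub = guid[(slash_indexes[1] + 1): slash_indexes[4]]
print(id_stub)  # devteam/bowtie2/bowtie2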

    def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit, tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize):
        """
        Perform search on the in-memory index, weighting fields by the given boosts.
        """
        # Change field boosts for searcher
        self.searcher = self.index.searcher(
            weighting=BM25F(
                field_B={'name_B': float(tool_name_boost),
                         'section_B': float(tool_section_boost),
                         'description_B': float(tool_description_boost),
                         'labels_B': float(tool_label_boost),
                         'stub_B': float(tool_stub_boost),
                         'help_B': float(tool_help_boost)}
            )
        )
        # Use OrGroup to change the default operation for joining multiple terms to logical OR.
        # This means that e.g. for the search 'bowtie of king arthur', a document containing only 'bowtie' is still a match.
        # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
        # However this changes scoring: for 'bowtie of king arthur', a document with 'arthur arthur arthur'
        # would score higher than a document with 'bowtie arthur', which is usually unexpected for a user.
        # Hence we introduce a bonus for documents matching multiple terms via the 'factory()' method,
        # using a scaling factor between 0 and 1.
        # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
        og = OrGroup.factory(0.9)
        self.parser = MultifieldParser(['name', 'description', 'section', 'help', 'labels', 'stub'], schema=self.schema, group=og)
        cleaned_query = q.lower()
        # Replace hyphens, since they are wildcards in Whoosh causing false positives
        if cleaned_query.find('-') != -1:
            cleaned_query = (' ').join(token.text for token in self.rex(to_unicode(cleaned_query)))
        if tool_enable_ngram_search is True:
            return self._search_ngrams(cleaned_query, tool_ngram_minsize, tool_ngram_maxsize, tool_search_limit)
        else:
            # Use the asterisk Whoosh wildcard so that e.g. 'bow' easily matches 'bowtie'
            parsed_query = self.parser.parse(cleaned_query + '*')
            hits = self.searcher.search(parsed_query, limit=float(tool_search_limit), sortedby='')
            return [hit['id'] for hit in hits]
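
A sketch of a full search call follows. The boost values and limit are illustrative only (the real values come from Galaxy's configuration), and toolbox_search stands in for an existing ToolBoxSearch instance:

# Hypothetical call; all numbers below are made up for illustration.
hits = toolbox_search.search(
    q='bowtie',
    tool_name_boost=9.0,
    tool_section_boost=3.0,
    tool_description_boost=2.0,
    tool_label_boost=1.0,
    tool_stub_boost=5.0,
    tool_help_boost=0.5,
    tool_search_limit=20,
    tool_enable_ngram_search=False,
    tool_ngram_minsize=3,
    tool_ngram_maxsize=4,
)
# hits is a list of tool ids, best match first.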

    def _search_ngrams(self, cleaned_query, tool_ngram_minsize, tool_ngram_maxsize, tool_search_limit):
        """
        Break tokens into ngrams and search on those instead.
        This should make searching more resistant to typos and unfinished words.
        See docs at https://whoosh.readthedocs.io/en/latest/ngrams.html
        """
        hits_with_score = {}
        token_analyzer = StandardAnalyzer() | analysis.NgramFilter(minsize=int(tool_ngram_minsize), maxsize=int(tool_ngram_maxsize))
        ngrams = [token.text for token in token_analyzer(cleaned_query)]
        for query in ngrams:
            # Get the tool list with respective scores for each qgram
            curr_hits = self.searcher.search(self.parser.parse('*' + query + '*'), limit=float(tool_search_limit))
            for i, curr_hit in enumerate(curr_hits):
                is_present = False
                for prev_hit in hits_with_score:
                    # Check if the tool appears again for the next qgram search
                    if curr_hit['id'] == prev_hit:
                        is_present = True
                        # Add the current score to the previous one if the
                        # tool appears again for the next qgram
                        hits_with_score[prev_hit] = curr_hits.score(i) + hits_with_score[prev_hit]
                # Add the tool to the collection with its score if not yet present
                if not is_present:
                    hits_with_score[curr_hit['id']] = curr_hits.score(i)
        # Sort the results by aggregated BM25 score in decreasing order
        hits_with_score = sorted(hits_with_score.items(), key=lambda x: x[1], reverse=True)
        # Return the tool ids
        return [item[0] for item in hits_with_score[0:int(tool_search_limit)]]
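
To see what the analyzer chain above produces, here is a minimal standalone sketch; the minsize/maxsize values are chosen arbitrarily:

# Break a query into 3- and 4-grams, mirroring what _search_ngrams does internally.
token_analyzer = StandardAnalyzer() | analysis.NgramFilter(minsize=3, maxsize=4)
print([token.text for token in token_analyzer(u'bowtie')])
# ['bow', 'bowt', 'owt', 'owti', 'wti', 'wtie', 'tie']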