Source code for tool_shed.webapp.search.repo_search

"""Module for searching the toolshed repositories"""

import logging

import whoosh.index
from whoosh import scoring
from whoosh.fields import (
    KEYWORD,
    NUMERIC,
    Schema,
    STORED,
    TEXT,
)
from whoosh.qparser import MultifieldParser
from whoosh.query import (
    And,
    Every,
    Term,
)

from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
from galaxy.util.search import parse_filters

log = logging.getLogger(__name__)

# Whoosh schema describing one repository document in the search index.
# RepoSearch.search passes this schema to its MultifieldParser and reads the
# stored values back from each hit to build the response.
schema = Schema(
    id=NUMERIC(stored=True),
    # name and description carry an index-level boost over the other text fields.
    name=TEXT(field_boost=1.7, stored=True),
    description=TEXT(field_boost=1.5, stored=True),
    long_description=TEXT(stored=True),
    homepage_url=TEXT(stored=True),
    remote_repository_url=TEXT(stored=True),
    repo_owner_username=TEXT(stored=True),
    # Comma-separated list of category names.
    categories=KEYWORD(stored=True, commas=True, scorable=True),
    # The remaining fields are STORED only: they are not in the parser's field
    # list and are simply read back from hits. RepoWeighting.final also reads
    # times_downloaded and approved to adjust the score.
    times_downloaded=STORED,
    approved=STORED,
    last_updated=STORED,
    repo_lineage=STORED,
    full_last_updated=STORED,
)


class RepoWeighting(scoring.BM25F):
    """
    Affect the BM25F scoring model through the final method.

    The BM25F score of each document is multiplied by a popularity factor
    derived from the stored ``times_downloaded`` value and by a certification
    factor derived from the stored ``approved`` value.

    source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ
    """

    # Ask whoosh to post-process every score through final().
    use_final = True

    def final(self, searcher, docnum, score):
        """
        Adjust the computed score of a document by popularity and certification.

        :param searcher: whoosh searcher used to read the document's stored fields
        :param docnum: number of the document being scored
        :param score: score computed by the BM25F model

        :returns: ``score`` multiplied by the popularity and certification modifiers
        """
        # Arbitrary for now
        reasonable_hits = 100.0

        # Read the stored fields once; both modifiers are derived from them.
        stored = searcher.stored_fields(docnum)

        try:
            times_downloaded = int(stored.get("times_downloaded"))
        except (TypeError, ValueError):
            # A missing or malformed counter must not abort the whole search;
            # treat the repository as never downloaded.
            times_downloaded = 0
        # Use at least 1 so an undownloaded repository does not get its score
        # multiplied by zero (and a bogus negative counter cannot invert it).
        times_downloaded = max(times_downloaded, 1)
        popularity_modifier = times_downloaded / reasonable_hits

        cert_modifier = 2 if stored.get("approved") == "yes" else 1

        # Adjust the computed score for this document by the popularity
        # and by the certification level.
        return score * popularity_modifier * cert_modifier
class RepoSearch:
    """Search the whoosh index of Tool Shed repositories."""

    # Indexed fields the user's free-text query is matched against.
    _SEARCH_FIELDS = (
        "name",
        "description",
        "long_description",
        "homepage_url",
        "remote_repository_url",
        "repo_owner_username",
        "categories",
    )

    # Stored fields copied unchanged from a hit into the response
    # (``id`` is handled separately because it has to be encoded).
    _HIT_FIELDS = (
        "repo_owner_username",
        "name",
        "long_description",
        "remote_repository_url",
        "homepage_url",
        "description",
        "last_updated",
        "full_last_updated",
        "repo_lineage",
        "categories",
        "approved",
        "times_downloaded",
    )

    def search(self, trans, search_term, page, page_size, boosts):
        """
        Perform the search on the given search_term

        :param trans: transaction object; provides ``app.config.whoosh_index_dir``
                      and ``security.encode_id``
        :param search_term: unicode encoded string with the search term(s)
        :param boosts: namedtuple containing custom boosts for searchfields, see api/repositories.py
        :param page_size: integer defining a length of one page
        :param page: integer with the number of page requested

        :returns results: dictionary containing hits themselves and the hits summary

        :raises InternalServerError: if no index exists in the configured directory
        :raises ObjectNotFound: if whoosh rejects the requested page
        """
        log.debug("raw search query: #%s", search_term)
        lower_search_term = search_term.lower()
        allow_query, search_term_without_filters = self._parse_reserved_filters(lower_search_term)
        log.debug("term without filters: #%s", search_term_without_filters)

        whoosh_index_dir = trans.app.config.whoosh_index_dir
        if not whoosh.index.exists_in(whoosh_index_dir):
            raise exceptions.InternalServerError("The search index file is missing.")
        index = whoosh.index.open_dir(whoosh_index_dir)

        # Acquire the searcher *before* entering the try block: if building the
        # weighting or opening the searcher fails there is nothing to close, and
        # the original error must not be masked by a failure in ``finally``.
        searcher = index.searcher(weighting=self._build_weighting(boosts))
        try:
            parser = MultifieldParser(list(self._SEARCH_FIELDS), schema=schema)

            # If user query has just filters prevent wildcard search.
            if not search_term_without_filters:
                user_query = Every("name")
                sortedby = "name"
            else:
                user_query = parser.parse(f"*{search_term_without_filters}*")
                sortedby = ""

            try:
                hits = searcher.search_page(
                    user_query, page, pagelen=page_size, filter=allow_query, terms=True, sortedby=sortedby
                )
                log.debug("total hits: %s", len(hits))
                log.debug("scored hits: %s", hits.scored_length())
            except ValueError as err:
                raise ObjectNotFound("The requested page does not exist.") from err

            return {
                "total_results": str(len(hits)),
                "page": str(page),
                "page_size": str(page_size),
                "hits": [self._hit_to_result(trans, hit) for hit in hits],
            }
        finally:
            searcher.close()

    @staticmethod
    def _build_weighting(boosts):
        """Create the scoring model with the per-field ``B`` parameters taken from ``boosts``."""
        # Some literature about BM25F:
        # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
        # http://en.wikipedia.org/wiki/Okapi_BM25
        # __Basically__ the higher number the bigger weight.
        return RepoWeighting(
            field_B={
                "name_B": boosts.repo_name_boost,
                "description_B": boosts.repo_description_boost,
                "long_description_B": boosts.repo_long_description_boost,
                "homepage_url_B": boosts.repo_homepage_url_boost,
                "remote_repository_url_B": boosts.repo_remote_repository_url_boost,
                "repo_owner_username_B": boosts.repo_owner_username_boost,
                "categories_B": boosts.categories_boost,
            }
        )

    def _hit_to_result(self, trans, hit):
        """Convert one whoosh hit into the ``{"repository": ..., "score": ...}`` response entry."""
        log.debug("matched terms: %s", hit.matched_terms())
        hit_dict = {"id": trans.security.encode_id(hit.get("id"))}
        for field_name in self._HIT_FIELDS:
            hit_dict[field_name] = hit.get(field_name)
        return {"repository": hit_dict, "score": hit.score}

    def _parse_reserved_filters(self, search_term):
        """
        Support github-like filters for narrowing the results.
        Order of chunks does not matter, only recognized
        filter names are allowed.

        :param search_term: the original search str from user input

        :returns allow_query: whoosh Query object used for filtering
            results of searching in index, or None when no filter was given
        :returns search_term_without_filters: str that represents user's
            search phrase without the filters

        >>> rs = RepoSearch()
        >>> rs._parse_reserved_filters("category:assembly")
        (And([Term('categories', 'assembly')]), '')
        >>> rs._parse_reserved_filters("category:assembly abyss")
        (And([Term('categories', 'assembly')]), 'abyss')
        >>> rs._parse_reserved_filters("category:'Climate Analysis' psy_maps")
        (And([Term('categories', 'Climate Analysis')]), 'psy_maps')
        >>> rs._parse_reserved_filters("climate category:'Climate Analysis' owner:'bjoern gruening' psy_maps")
        (And([Term('categories', 'Climate Analysis'), Term('repo_owner_username', 'bjoern gruening')]), 'climate psy_maps')
        >>> rs._parse_reserved_filters("climate category:'John Says This Fails' owner:'bjoern gruening' psy_maps")
        (And([Term('categories', 'John Says This Fails'), Term('repo_owner_username', 'bjoern gruening')]), 'climate psy_maps')
        >>> rs._parse_reserved_filters("climate o:'bjoern gruening' middle strings c:'John Says This Fails' psy_maps")
        (And([Term('repo_owner_username', 'bjoern gruening'), Term('categories', 'John Says This Fails')]), 'climate middle strings psy_maps')
        >>> rs._parse_reserved_filters("abyss category:assembly")
        (And([Term('categories', 'assembly')]), 'abyss')
        >>> rs._parse_reserved_filters("abyss category:assembly greg")
        (And([Term('categories', 'assembly')]), 'abyss greg')
        >>> rs._parse_reserved_filters("owner:greg")
        (And([Term('repo_owner_username', 'greg')]), '')
        >>> rs._parse_reserved_filters("owner:greg category:assembly abyss")
        (And([Term('repo_owner_username', 'greg'), Term('categories', 'assembly')]), 'abyss')
        >>> rs._parse_reserved_filters("meaningoflife:42")
        (None, 'meaningoflife:42')
        """
        # Accepted filter names (long and short form) mapped to index fields.
        filters = {
            "category": "categories",
            "c": "categories",
            "owner": "repo_owner_username",
            "o": "repo_owner_username",
        }
        parsed_filters, search_term_without_filters = parse_filters(search_term, filters)
        # No recognized filter means no restriction on the searched documents.
        allow_query = And([Term(t, v) for (t, v, _) in parsed_filters]) if parsed_filters else None
        return allow_query, search_term_without_filters