Source code for tool_shed.webapp.search.repo_search
"""Module for searching the toolshed repositories"""
import logging
import whoosh.index
from whoosh import scoring
from whoosh.fields import (
KEYWORD,
NUMERIC,
Schema,
STORED,
TEXT,
)
from whoosh.qparser import MultifieldParser
from whoosh.query import (
And,
Every,
Term,
)
from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
from galaxy.util.search import parse_filters
log = logging.getLogger(__name__)
schema = Schema(
id=NUMERIC(stored=True),
name=TEXT(field_boost=1.7, stored=True),
description=TEXT(field_boost=1.5, stored=True),
long_description=TEXT(stored=True),
homepage_url=TEXT(stored=True),
remote_repository_url=TEXT(stored=True),
repo_owner_username=TEXT(stored=True),
categories=KEYWORD(stored=True, commas=True, scorable=True),
times_downloaded=STORED,
approved=STORED,
last_updated=STORED,
repo_lineage=STORED,
full_last_updated=STORED,
)


class RepoWeighting(scoring.BM25F):
    """
    Affect the BM25F scoring model through the final method.
    source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ
    """

    use_final = True

    def final(self, searcher, docnum, score):
        # Arbitrary reference point for "popular enough", for now.
        reasonable_hits = 100.0

        stored_times_downloaded = searcher.stored_fields(docnum)["times_downloaded"]
        if not isinstance(stored_times_downloaded, int):
            times_downloaded = int(stored_times_downloaded)
        else:
            times_downloaded = stored_times_downloaded
        # Treat zero downloads as one so the popularity modifier is never zero.
        if times_downloaded == 0:
            times_downloaded = 1
        popularity_modifier = times_downloaded / reasonable_hits

        cert_modifier = 2 if searcher.stored_fields(docnum)["approved"] == "yes" else 1

        # Adjust the computed score for this document by the popularity
        # and by the certification level.
        final_score = score * popularity_modifier * cert_modifier
        return final_score
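

# --- Illustrative sketch (appended for documentation; not part of the module) --------
# The arithmetic that final() applies, shown standalone with made-up numbers: a
# certified ("approved" == "yes") repository downloaded 500 times ends up with ten
# times its raw BM25F score.  The helper name is hypothetical.
def _rescore_example(raw_score, times_downloaded, approved):
    reasonable_hits = 100.0
    times_downloaded = times_downloaded or 1  # never let the popularity modifier be zero
    popularity_modifier = times_downloaded / reasonable_hits
    cert_modifier = 2 if approved == "yes" else 1
    return raw_score * popularity_modifier * cert_modifier


assert _rescore_example(3.0, 500, "yes") == 30.0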


class RepoSearch:
    def search(self, trans, search_term, page, page_size, boosts):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)
        :param boosts: namedtuple containing custom boosts for searchfields, see api/repositories.py
        :param page_size: integer defining a length of one page
        :param page: integer with the number of page requested

        :returns results: dictionary containing hits themselves and the hits summary
        """
        log.debug(f"raw search query: #{str(search_term)}")
        lower_search_term = search_term.lower()
        allow_query, search_term_without_filters = self._parse_reserved_filters(lower_search_term)
        log.debug(f"term without filters: #{str(search_term_without_filters)}")

        whoosh_index_dir = trans.app.config.whoosh_index_dir
        index_exists = whoosh.index.exists_in(whoosh_index_dir)
        if index_exists:
            index = whoosh.index.open_dir(whoosh_index_dir)
            try:
                # Some literature about BM25F:
                # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                # http://en.wikipedia.org/wiki/Okapi_BM25
                # __Basically__ the higher the number, the bigger the weight.
                repo_weighting = RepoWeighting(
                    field_B={
                        "name_B": boosts.repo_name_boost,
                        "description_B": boosts.repo_description_boost,
                        "long_description_B": boosts.repo_long_description_boost,
                        "homepage_url_B": boosts.repo_homepage_url_boost,
                        "remote_repository_url_B": boosts.repo_remote_repository_url_boost,
                        "repo_owner_username_B": boosts.repo_owner_username_boost,
                        "categories_B": boosts.categories_boost,
                    }
                )
                searcher = index.searcher(weighting=repo_weighting)
                parser = MultifieldParser(
                    [
                        "name",
                        "description",
                        "long_description",
                        "homepage_url",
                        "remote_repository_url",
                        "repo_owner_username",
                        "categories",
                    ],
                    schema=schema,
                )
                # If the user query contains only filters, prevent a wildcard search.
                if len(search_term_without_filters) < 1:
                    user_query = Every("name")
                    sortedby = "name"
                else:
                    user_query = parser.parse(f"*{search_term_without_filters}*")
                    sortedby = ""
                try:
                    hits = searcher.search_page(
                        user_query, page, pagelen=page_size, filter=allow_query, terms=True, sortedby=sortedby
                    )
                    log.debug(f"total hits: {str(len(hits))}")
                    log.debug(f"scored hits: {str(hits.scored_length())}")
                except ValueError:
                    raise ObjectNotFound("The requested page does not exist.")
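
                # Shape of the dictionary assembled below (values are illustrative; the
                # nested "repository" dict carries the stored fields from the schema above):
                #
                #     {
                #         "total_results": "1",
                #         "page": "1",
                #         "page_size": "10",
                #         "hits": [
                #             {
                #                 "repository": {"id": "<encoded id>", "name": "abyss", ...},
                #                 "score": 4.2,
                #             },
                #         ],
                #     }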
                results = {}
                results["total_results"] = str(len(hits))
                results["page"] = str(page)
                results["page_size"] = str(page_size)
                results["hits"] = []
                for hit in hits:
                    log.debug(f"matched terms: {str(hit.matched_terms())}")
                    hit_dict = {}
                    hit_dict["id"] = trans.security.encode_id(hit.get("id"))
                    hit_dict["repo_owner_username"] = hit.get("repo_owner_username")
                    hit_dict["name"] = hit.get("name")
                    hit_dict["long_description"] = hit.get("long_description")
                    hit_dict["remote_repository_url"] = hit.get("remote_repository_url")
                    hit_dict["homepage_url"] = hit.get("homepage_url")
                    hit_dict["description"] = hit.get("description")
                    hit_dict["last_updated"] = hit.get("last_updated")
                    hit_dict["full_last_updated"] = hit.get("full_last_updated")
                    hit_dict["repo_lineage"] = hit.get("repo_lineage")
                    hit_dict["categories"] = hit.get("categories")
                    hit_dict["approved"] = hit.get("approved")
                    hit_dict["times_downloaded"] = hit.get("times_downloaded")
                    results["hits"].append({"repository": hit_dict, "score": hit.score})
                return results
            finally:
                searcher.close()
        else:
            raise exceptions.InternalServerError("The search index file is missing.")

    def _parse_reserved_filters(self, search_term):
        """
        Support github-like filters for narrowing the results.
        Order of chunks does not matter, only recognized
        filter names are allowed.

        :param search_term: the original search str from user input

        :returns allow_query: whoosh Query object used for filtering
            results of searching in index
        :returns search_term_without_filters: str that represents user's
            search phrase without the filters

        >>> rs = RepoSearch()
        >>> rs._parse_reserved_filters("category:assembly")
        (And([Term('categories', 'assembly')]), '')
        >>> rs._parse_reserved_filters("category:assembly abyss")
        (And([Term('categories', 'assembly')]), 'abyss')
        >>> rs._parse_reserved_filters("category:'Climate Analysis' psy_maps")
        (And([Term('categories', 'Climate Analysis')]), 'psy_maps')
        >>> rs._parse_reserved_filters("climate category:'Climate Analysis' owner:'bjoern gruening' psy_maps")
        (And([Term('categories', 'Climate Analysis'), Term('repo_owner_username', 'bjoern gruening')]), 'climate psy_maps')
        >>> rs._parse_reserved_filters("climate category:'John Says This Fails' owner:'bjoern gruening' psy_maps")
        (And([Term('categories', 'John Says This Fails'), Term('repo_owner_username', 'bjoern gruening')]), 'climate psy_maps')
        >>> rs._parse_reserved_filters("climate o:'bjoern gruening' middle strings c:'John Says This Fails' psy_maps")
        (And([Term('repo_owner_username', 'bjoern gruening'), Term('categories', 'John Says This Fails')]), 'climate middle strings psy_maps')
        >>> rs._parse_reserved_filters("abyss category:assembly")
        (And([Term('categories', 'assembly')]), 'abyss')
        >>> rs._parse_reserved_filters("abyss category:assembly greg")
        (And([Term('categories', 'assembly')]), 'abyss greg')
        >>> rs._parse_reserved_filters("owner:greg")
        (And([Term('repo_owner_username', 'greg')]), '')
        >>> rs._parse_reserved_filters("owner:greg category:assembly abyss")
        (And([Term('repo_owner_username', 'greg'), Term('categories', 'assembly')]), 'abyss')
        >>> rs._parse_reserved_filters("meaningoflife:42")
        (None, 'meaningoflife:42')
        """
        filters = {
            "category": "categories",
            "c": "categories",
            "owner": "repo_owner_username",
            "o": "repo_owner_username",
        }
        allow_query, search_term_without_filters = parse_filters(search_term, filters)
        allow_query = (
            And([Term(t, v) for (t, v, _) in allow_query] if len(allow_query) > 0 else None) if allow_query else None
        )
        return allow_query, search_term_without_filters
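

# --- Illustrative usage sketch (appended for documentation; not part of the module) --
# A minimal, hedged example of driving RepoSearch end to end: it builds a throwaway
# whoosh index with the module-level ``schema`` above and fakes only the parts of
# ``trans`` and ``boosts`` that search() actually touches.  The boost values, the
# indexed document and the temporary directory are made up for the example.
import tempfile
from collections import namedtuple
from types import SimpleNamespace

import whoosh.index

# Only needed when running this sketch outside the module itself.
from tool_shed.webapp.search.repo_search import RepoSearch, schema

# Index a single repository document using the schema defined above.
index_dir = tempfile.mkdtemp()
index = whoosh.index.create_in(index_dir, schema)
writer = index.writer()
writer.add_document(
    id=1,
    name="abyss",
    description="Short-read de novo assembler",
    long_description="Assembly By Short Sequences",
    repo_owner_username="greg",
    categories="assembly",
    times_downloaded=500,
    approved="yes",
)
writer.commit()

# search() only reads the index location and the id encoder from ``trans``.
trans = SimpleNamespace(
    app=SimpleNamespace(config=SimpleNamespace(whoosh_index_dir=index_dir)),
    security=SimpleNamespace(encode_id=str),
)

# The namedtuple fields mirror the ``boosts`` attributes accessed in search();
# the numbers themselves are arbitrary.
Boosts = namedtuple(
    "Boosts",
    [
        "repo_name_boost",
        "repo_description_boost",
        "repo_long_description_boost",
        "repo_homepage_url_boost",
        "repo_remote_repository_url_boost",
        "repo_owner_username_boost",
        "categories_boost",
    ],
)
boosts = Boosts(0.9, 0.6, 0.5, 0.3, 0.2, 0.3, 0.5)

# "category:assembly" becomes a whoosh filter (see the doctests above); "abyss" is the
# free-text part of the query.
results = RepoSearch().search(trans, "category:assembly abyss", page=1, page_size=10, boosts=boosts)
print(results["total_results"], results["hits"][0]["repository"]["name"])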