Source code for tool_shed.webapp.search.repo_search
"""Module for searching the toolshed repositories"""
import logging
import whoosh.index
from whoosh import scoring
from whoosh.fields import (
KEYWORD,
NUMERIC,
Schema,
STORED,
TEXT,
)
from whoosh.qparser import MultifieldParser
from whoosh.query import (
And,
Every,
Term,
)
from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
from galaxy.util.search import parse_filters
log = logging.getLogger(__name__)
schema = Schema(
id=NUMERIC(stored=True),
name=TEXT(field_boost=1.7, stored=True),
description=TEXT(field_boost=1.5, stored=True),
long_description=TEXT(stored=True),
homepage_url=TEXT(stored=True),
remote_repository_url=TEXT(stored=True),
repo_owner_username=TEXT(stored=True),
categories=KEYWORD(stored=True, commas=True, scorable=True),
times_downloaded=STORED,
approved=STORED,
last_updated=STORED,
repo_lineage=STORED,
full_last_updated=STORED,
)


class RepoWeighting(scoring.BM25F):
    """
    Affect the BM25F scoring model through the final method.
    source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ
    """

    use_final = True

    def final(self, searcher, docnum, score):
        # Arbitrary reference point for "popular enough", for now.
        reasonable_hits = 100.0

        stored_times_downloaded = searcher.stored_fields(docnum)["times_downloaded"]
        if not isinstance(stored_times_downloaded, int):
            times_downloaded = int(stored_times_downloaded)
        else:
            times_downloaded = stored_times_downloaded
        # Treat zero downloads as one so the popularity modifier is never zero.
        if times_downloaded == 0:
            times_downloaded = 1
        popularity_modifier = times_downloaded / reasonable_hits

        cert_modifier = 2 if searcher.stored_fields(docnum)["approved"] == "yes" else 1

        # Adjust the computed score for this document by the popularity
        # and by the certification level.
        final_score = score * popularity_modifier * cert_modifier
        return final_score
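

# --- Illustrative sketch (appended for documentation; not part of the module) --------
# The arithmetic that final() applies, shown standalone with made-up numbers: a
# certified ("approved" == "yes") repository downloaded 500 times ends up with ten
# times its raw BM25F score.  The helper name is hypothetical.
def _rescore_example(raw_score, times_downloaded, approved):
    reasonable_hits = 100.0
    times_downloaded = times_downloaded or 1  # never let the popularity modifier be zero
    popularity_modifier = times_downloaded / reasonable_hits
    cert_modifier = 2 if approved == "yes" else 1
    return raw_score * popularity_modifier * cert_modifier


assert _rescore_example(3.0, 500, "yes") == 30.0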


class RepoSearch:
    def search(self, trans, search_term, page, page_size, boosts):
        """
        Perform the search on the given search_term

        :param search_term: unicode encoded string with the search term(s)
        :param boosts: namedtuple containing custom boosts for searchfields, see api/repositories.py
        :param page_size: integer defining a length of one page
        :param page: integer with the number of page requested

        :returns results: dictionary containing hits themselves and the hits summary
        """
        log.debug(f"raw search query: #{str(search_term)}")
        lower_search_term = search_term.lower()
        allow_query, search_term_without_filters = self._parse_reserved_filters(lower_search_term)
        log.debug(f"term without filters: #{str(search_term_without_filters)}")

        whoosh_index_dir = trans.app.config.whoosh_index_dir
        index_exists = whoosh.index.exists_in(whoosh_index_dir)
        if index_exists:
            index = whoosh.index.open_dir(whoosh_index_dir)
            try:
                # Some literature about BM25F:
                # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
                # http://en.wikipedia.org/wiki/Okapi_BM25
                # __Basically__ the higher the number, the bigger the weight.
                repo_weighting = RepoWeighting(
                    field_B={
                        "name_B": boosts.repo_name_boost,
                        "description_B": boosts.repo_description_boost,
                        "long_description_B": boosts.repo_long_description_boost,
                        "homepage_url_B": boosts.repo_homepage_url_boost,
                        "remote_repository_url_B": boosts.repo_remote_repository_url_boost,
                        "repo_owner_username_B": boosts.repo_owner_username_boost,
                        "categories_B": boosts.categories_boost,
                    }
                )
                searcher = index.searcher(weighting=repo_weighting)
                parser = MultifieldParser(
                    [
                        "name",
                        "description",
                        "long_description",
                        "homepage_url",
                        "remote_repository_url",
                        "repo_owner_username",
                        "categories",
                    ],
                    schema=schema,
                )
                # If the user query contains only filters, prevent a wildcard search.
                if len(search_term_without_filters) < 1:
                    user_query = Every("name")
                    sortedby = "name"
                else:
                    user_query = parser.parse(f"*{search_term_without_filters}*")
                    sortedby = ""
                try:
                    hits = searcher.search_page(
                        user_query, page, pagelen=page_size, filter=allow_query, terms=True, sortedby=sortedby
                    )
                    log.debug(f"total hits: {str(len(hits))}")
                    log.debug(f"scored hits: {str(hits.scored_length())}")
                except ValueError:
                    raise ObjectNotFound("The requested page does not exist.")
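
                # Shape of the dictionary assembled below (values are illustrative; the
                # nested "repository" dict carries the stored fields from the schema above):
                #
                #     {
                #         "total_results": "1",
                #         "page": "1",
                #         "page_size": "10",
                #         "hits": [
                #             {
                #                 "repository": {"id": "<encoded id>", "name": "abyss", ...},
                #                 "score": 4.2,
                #             },
                #         ],
                #     }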
                results = {}
                results["total_results"] = str(len(hits))
                results["page"] = str(page)
                results["page_size"] = str(page_size)
                results["hits"] = []
                for hit in hits:
                    log.debug(f"matched terms: {str(hit.matched_terms())}")
                    hit_dict = {}
                    hit_dict["id"] = trans.security.encode_id(hit.get("id"))
                    hit_dict["repo_owner_username"] = hit.get("repo_owner_username")
                    hit_dict["name"] = hit.get("name")
                    hit_dict["long_description"] = hit.get("long_description")
                    hit_dict["remote_repository_url"] = hit.get("remote_repository_url")
                    hit_dict["homepage_url"] = hit.get("homepage_url")
                    hit_dict["description"] = hit.get("description")
                    hit_dict["last_updated"] = hit.get("last_updated")
                    hit_dict["full_last_updated"] = hit.get("full_last_updated")
                    hit_dict["repo_lineage"] = hit.get("repo_lineage")
                    hit_dict["categories"] = hit.get("categories")
                    hit_dict["approved"] = hit.get("approved")
                    hit_dict["times_downloaded"] = hit.get("times_downloaded")
                    results["hits"].append({"repository": hit_dict, "score": hit.score})
                return results
            finally:
                searcher.close()
        else:
            raise exceptions.InternalServerError("The search index file is missing.")

    def _parse_reserved_filters(self, search_term):
        """
        Support github-like filters for narrowing the results.
        Order of chunks does not matter, only recognized
        filter names are allowed.

        :param search_term: the original search str from user input

        :returns allow_query: whoosh Query object used for filtering
            results of searching in index
        :returns search_term_without_filters: str that represents user's
            search phrase without the filters

        >>> rs = RepoSearch()
        >>> rs._parse_reserved_filters("category:assembly")
        (And([Term('categories', 'assembly')]), '')
        >>> rs._parse_reserved_filters("category:assembly abyss")
        (And([Term('categories', 'assembly')]), 'abyss')
        >>> rs._parse_reserved_filters("category:'Climate Analysis' psy_maps")
        (And([Term('categories', 'Climate Analysis')]), 'psy_maps')
        >>> rs._parse_reserved_filters("climate category:'Climate Analysis' owner:'bjoern gruening' psy_maps")
        (And([Term('categories', 'Climate Analysis'), Term('repo_owner_username', 'bjoern gruening')]), 'climate psy_maps')
        >>> rs._parse_reserved_filters("climate category:'John Says This Fails' owner:'bjoern gruening' psy_maps")
        (And([Term('categories', 'John Says This Fails'), Term('repo_owner_username', 'bjoern gruening')]), 'climate psy_maps')
        >>> rs._parse_reserved_filters("climate o:'bjoern gruening' middle strings c:'John Says This Fails' psy_maps")
        (And([Term('repo_owner_username', 'bjoern gruening'), Term('categories', 'John Says This Fails')]), 'climate middle strings psy_maps')
        >>> rs._parse_reserved_filters("abyss category:assembly")
        (And([Term('categories', 'assembly')]), 'abyss')
        >>> rs._parse_reserved_filters("abyss category:assembly greg")
        (And([Term('categories', 'assembly')]), 'abyss greg')
        >>> rs._parse_reserved_filters("owner:greg")
        (And([Term('repo_owner_username', 'greg')]), '')
        >>> rs._parse_reserved_filters("owner:greg category:assembly abyss")
        (And([Term('repo_owner_username', 'greg'), Term('categories', 'assembly')]), 'abyss')
        >>> rs._parse_reserved_filters("meaningoflife:42")
        (None, 'meaningoflife:42')
        """
        filters = {
            "category": "categories",
            "c": "categories",
            "owner": "repo_owner_username",
            "o": "repo_owner_username",
        }
        allow_query, search_term_without_filters = parse_filters(search_term, filters)
        allow_query = (
            And([Term(t, v) for (t, v, _) in allow_query] if len(allow_query) > 0 else None) if allow_query else None
        )
        return allow_query, search_term_without_filters
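

# --- Illustrative usage sketch (appended for documentation; not part of the module) --
# A minimal, hedged example of driving RepoSearch end to end: it builds a throwaway
# whoosh index with the module-level ``schema`` above and fakes only the parts of
# ``trans`` and ``boosts`` that search() actually touches.  The boost values, the
# indexed document and the temporary directory are made up for the example.
import tempfile
from collections import namedtuple
from types import SimpleNamespace

import whoosh.index

# Only needed when running this sketch outside the module itself.
from tool_shed.webapp.search.repo_search import RepoSearch, schema

# Index a single repository document using the schema defined above.
index_dir = tempfile.mkdtemp()
index = whoosh.index.create_in(index_dir, schema)
writer = index.writer()
writer.add_document(
    id=1,
    name="abyss",
    description="Short-read de novo assembler",
    long_description="Assembly By Short Sequences",
    repo_owner_username="greg",
    categories="assembly",
    times_downloaded=500,
    approved="yes",
)
writer.commit()

# search() only reads the index location and the id encoder from ``trans``.
trans = SimpleNamespace(
    app=SimpleNamespace(config=SimpleNamespace(whoosh_index_dir=index_dir)),
    security=SimpleNamespace(encode_id=str),
)

# The namedtuple fields mirror the ``boosts`` attributes accessed in search();
# the numbers themselves are arbitrary.
Boosts = namedtuple(
    "Boosts",
    [
        "repo_name_boost",
        "repo_description_boost",
        "repo_long_description_boost",
        "repo_homepage_url_boost",
        "repo_remote_repository_url_boost",
        "repo_owner_username_boost",
        "categories_boost",
    ],
)
boosts = Boosts(0.9, 0.6, 0.5, 0.3, 0.2, 0.3, 0.5)

# "category:assembly" becomes a whoosh filter (see the doctests above); "abyss" is the
# free-text part of the query.
results = RepoSearch().search(trans, "category:assembly abyss", page=1, page_size=10, boosts=boosts)
print(results["total_results"], results["hits"][0]["repository"]["name"])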