Source code for galaxy.job_metrics.instrumenters.collectl

"""The module describes the ``collectl`` job metrics plugin."""
import logging
import os
import shutil

from galaxy import util
from . import InstrumentPlugin
from .. import formatting
from ..collectl import (
    cli,
    processes,
    subsystems
)

log = logging.getLogger(__name__)

# By default, only grab statistics for user processes (as identified by
# username).
DEFAULT_PROCFILT_ON = "username"
DEFAULT_SUBSYSTEMS = "process"
# Set to zero to flush every collection.
DEFAULT_FLUSH_INTERVAL = "0"
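
# These defaults can be overridden wherever the plugin is instantiated; in a
# typical Galaxy deployment that is an element in the job metrics XML
# configuration, whose attributes become the ``kwargs`` consumed by
# ``CollectlPlugin.__init__`` below. A hedged sketch only (attribute values
# and the configuration file location depend on the deployment):
#
#   <collectl subsystems="process" procfilt_on="username" flush="0"
#             interval="10" saved_logs_path="collectl_logs" />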

FORMATTED_RESOURCE_TITLES = {
    "PCT": "Percent CPU Usage",
    "RSYS": "Disk Reads",
    "WSYS": "Disk Writes",
}

EMPTY_COLLECTL_FILE_MESSAGE = "Skipping process summary due to empty file... job probably did not run long enough for collectl to gather data."


class CollectlFormatter(formatting.JobMetricFormatter):

    def format(self, key, value):
        if key == "pid":
            return ("Process ID", int(value))
        elif key == "raw_log_path":
            return ("Relative Path of Full Collectl Log", value)
        elif key == "process_max_AccumT":
            return ("Job Runtime (System+User)", formatting.seconds_to_str(float(value)))
        else:
            _, stat_type, resource_type = key.split("_", 2)
            if resource_type.startswith("Vm"):
                value_str = "%s KB" % int(value)
            elif resource_type in ["RSYS", "WSYS"] and stat_type in ["count", "max", "sum"]:
                value_str = "%d (# system calls)" % int(value)
            else:
                value_str = str(value)
            resource_title = FORMATTED_RESOURCE_TITLES.get(resource_type, resource_type)
            return ("{} ({})".format(resource_title, stat_type), value_str)

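# Illustrative calls showing how the branches above render metrics (values
# are made up; keys follow the ``process_<statistic>_<resource>`` convention
# produced by ``CollectlPlugin.job_properties`` below):
#
#   formatter = CollectlFormatter()
#   formatter.format("pid", "12345")
#   # -> ("Process ID", 12345)
#   formatter.format("process_max_PCT", "85.3")
#   # -> ("Percent CPU Usage (max)", "85.3")
#   formatter.format("process_count_RSYS", "42")
#   # -> ("Disk Reads (count)", "42 (# system calls)")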

class CollectlPlugin(InstrumentPlugin):
    """Run collectl along with the job to capture system and/or process data
    according to specified collectl subsystems.
    """
    plugin_type = "collectl"
    formatter = CollectlFormatter()

    def __init__(self, **kwargs):
        self.__configure_paths(kwargs)
        self.__configure_subsystems(kwargs)
        saved_logs_path = kwargs.get("saved_logs_path", "")
        if "app" in kwargs:
            log.debug("Found path for saved logs: %s" % saved_logs_path)
            saved_logs_path = kwargs["app"].config.resolve_path(saved_logs_path)
        self.saved_logs_path = saved_logs_path
        self.__configure_collectl_recorder_args(kwargs)
        self.summarize_process_data = util.asbool(kwargs.get("summarize_process_data", True))
        self.log_collectl_program_output = util.asbool(kwargs.get("log_collectl_program_output", False))
        if self.summarize_process_data:
            if subsystems.get_subsystem("process") not in self.subsystems:
                raise Exception("Collectl plugin misconfigured - cannot summarize_process_data without process subsystem being enabled.")

            # None will let the processes module use the default set of
            # statistics defined there.
            process_statistics = kwargs.get("process_statistics", None)
            self.process_statistics = processes.parse_process_statistics(process_statistics)

    def pre_execute_instrument(self, job_directory):
        commands = []
        # Capture PID of the process so we can walk its ancestors when
        # building statistics for the whole job.
        commands.append('''echo "$$" > '%s' ''' % self.__pid_file(job_directory))
        # Run collectl in record mode to capture process- and system-level
        # statistics according to the supplied subsystems.
        commands.append(self.__collectl_record_command(job_directory))
        return commands
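
    # For illustration, the two commands above amount to something like the
    # following in the job script (the pid file name and collectl invocation
    # are assumptions here - the real values come from InstrumentPlugin's
    # file naming and cli.CollectlCli):
    #
    #   echo "$$" > '<job_directory>/__instrument_collectl_pid'
    #   <collectl record command line> > /dev/null 2>&1 &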

    def post_execute_instrument(self, job_directory):
        commands = []
        # collectl dies when the job script completes. A possible improvement:
        # capture the PID of collectl above and check here whether it is still
        # alive, to verify collectl ran through the whole job.
        return commands

    def job_properties(self, job_id, job_directory):
        with open(self.__pid_file(job_directory)) as f:
            pid = f.read().strip()
        contents = os.listdir(job_directory)
        try:
            rel_path = next(iter(filter(self._is_instrumented_collectl_log, contents)))
            path = os.path.join(job_directory, rel_path)
        except StopIteration:
            message = "Failed to find collectl log in directory {}, files were {}".format(job_directory, contents)
            raise Exception(message)

        properties = dict(
            pid=int(pid),
        )

        if self.saved_logs_path:
            destination_rel_dir = os.path.join(*util.directory_hash_id(job_id))
            destination_rel_path = os.path.join(destination_rel_dir, rel_path)
            destination_path = os.path.join(self.saved_logs_path, destination_rel_path)
            destination_dir = os.path.dirname(destination_path)
            if not os.path.isdir(destination_dir):
                os.makedirs(destination_dir)
            shutil.copyfile(path, destination_path)
            properties["raw_log_path"] = destination_rel_path

        if self.summarize_process_data:
            # Run collectl in playback mode and generate statistics of interest.
            summary_statistics = self.__summarize_process_data(pid, path)
            for statistic, value in summary_statistics:
                properties["process_%s" % "_".join(statistic)] = value

        return properties
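
    # An illustrative return value (numbers made up), assuming
    # saved_logs_path is configured and summarize_process_data is enabled:
    #
    #   {
    #       "pid": 12345,
    #       "raw_log_path": "<hashed job dir>/<collectl log>.raw.gz",
    #       "process_max_AccumT": 125.0,
    #       "process_max_PCT": 85.3,
    #       ...
    #   }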

    def __configure_paths(self, kwargs):
        # 95% of the time I would expect collectl to just be installed with
        # apt or yum, but if it is manually installed and not on PATH, allow
        # configuration of an explicit path - and allow the path to differ
        # between the Galaxy job handler (local_collectl_path) and the
        # compute node (remote_collectl_path).
        collectl_path = kwargs.get("collectl_path", "collectl")
        self.remote_collectl_path = kwargs.get("remote_collectl_path", collectl_path)
        self.local_collectl_path = kwargs.get("local_collectl_path", collectl_path)

    def __configure_subsystems(self, kwargs):
        raw_subsystems_str = kwargs.get("subsystems", DEFAULT_SUBSYSTEMS)
        raw_subsystems = util.listify(raw_subsystems_str, do_strip=True)
        self.subsystems = [subsystems.get_subsystem(_) for _ in raw_subsystems]

    def __configure_collectl_recorder_args(self, kwargs):
        collectl_recorder_args = kwargs.copy()

        # Allow the deployer to configure separate system and process
        # intervals, but if they specify just one - use it for both. The
        # thinking here is that this plugin's most useful feature is the
        # process-level information, so that is likely what the deployer is
        # attempting to configure.
        if "interval" in kwargs and "interval2" not in kwargs:
            collectl_recorder_args["interval2"] = kwargs["interval"]

        if "flush" not in kwargs:
            collectl_recorder_args["flush"] = DEFAULT_FLUSH_INTERVAL

        procfilt_on = kwargs.get("procfilt_on", DEFAULT_PROCFILT_ON).lower()
        # Calculate explicit arguments; the rest can just be passed through
        # from the constructor arguments.
        explicit_args = dict(
            collectl_path=self.remote_collectl_path,
            procfilt=procfilt_argument(procfilt_on),
            subsystems=self.subsystems,
        )
        collectl_recorder_args.update(explicit_args)
        self.collectl_recorder_args = collectl_recorder_args

    def __summarize_process_data(self, pid, collectl_log_path):
        playback_cli_args = dict(
            collectl_path=self.local_collectl_path,
            playback_path=collectl_log_path,
            # sep="9" requests ASCII 9 (tab) as the output field separator.
            sep="9",
        )
        if not os.stat(collectl_log_path).st_size:
            log.debug(EMPTY_COLLECTL_FILE_MESSAGE)
            return []

        playback_cli = cli.CollectlCli(**playback_cli_args)
        return processes.generate_process_statistics(playback_cli, pid, self.process_statistics)

    def __collectl_recorder_cli(self, job_directory):
        cli_args = self.collectl_recorder_args.copy()
        cli_args["destination_path"] = self._instrument_file_path(job_directory, "log")
        return cli.CollectlCli(**cli_args)

    def __collectl_record_command(self, job_directory):
        collectl_cli = self.__collectl_recorder_cli(job_directory)
        if self.log_collectl_program_output:
            redirect_to = self._instrument_file_path(job_directory, "program_output")
        else:
            redirect_to = "/dev/null"
        return "{} > {} 2>&1 &".format(
            collectl_cli.build_command_line(),
            redirect_to,
        )

    def __pid_file(self, job_directory):
        return self._instrument_file_path(job_directory, "pid")

    def _is_instrumented_collectl_log(self, filename):
        prefix = self._instrument_file_name("log")
        return filename.startswith(prefix) and filename.endswith(".raw.gz")


def procfilt_argument(procfilt_on):
    if procfilt_on == "username":
        return "U$USER"
    elif procfilt_on == "uid":
        return "u$UID"
    else:
        # Ensure it is empty or "none".
        if procfilt_on and procfilt_on.lower() != "none":
            raise Exception("Invalid procfilt_on argument encountered")
        return ""


__all__ = ('CollectlPlugin', )
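
# For reference, the mapping implemented above: "username" -> "U$USER",
# "uid" -> "u$UID", and "" or "none" -> "" (no process filtering). The
# "U"/"u" prefixes follow collectl's --procfilt syntax for filtering by
# username and UID respectively.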