blob: 5b019673dd02402069307f9d08af03fd3d6afe5a [file] [log] [blame]
#!/usr/bin/python
from __future__ import print_function, division
import sys
from subprocess import call
import os
import shlex
import argparse
import textwrap
import sqlite3
import socket # for hostname
import getpass # for username
from util import (walk_runs, get_trailing_num, get_result_file,
get_task_table_name, immediate_subdir, natural_sort,
is_int, is_float, convert_strictest)
nullval = '-1'
type_map = {int: "INT", float: "REAL", str: "TEXT"}
# main program sequence
def main():
params = Params()
parse_args(params)
# operate on database
db = sqlite3.connect(params.database)
db.row_factory = sqlite3.Row
while params.task_dir:
if (params.clean):
drop_table(params, db)
else:
verify_paths(params)
initialize_tracked_columns(params, db)
update_db(params, db)
load_next_task(params)
db.close()
def verify_paths(params):
"""verify paths exist as either relational or absolute paths, modifying them if necessary"""
if os.path.isdir(params.task_dir):
walk_runs(params, check_result_exists)
else:
print(params.task_dir, " directory does not exist")
def update_db(params, db):
# check if table for task exists; if not then create it
task_table_name = params.task_table_name
create_table(params, db, task_table_name)
# load up latest run and parsed date for task
def check_last_runs_table(runs):
natural_sort(runs)
cursor = db.cursor()
cursor.execute("SELECT MAX({}), MAX({}) FROM {}".format("parsed_date", "run", params.task_table_name))
row = cursor.fetchone()
if row[0]:
# storing in database truncates decimal, so add 1 second
last_parsed_date = row[0] + 1
last_run = row[1]
print("last parsed date", last_parsed_date)
print("last run", last_run)
params.last_parsed_date = last_parsed_date
params.last_run = last_run
else:
print("first population")
params.last_run = 0
# else first run, nothing in table yet
def add_run_to_db(params, run):
run_number = get_trailing_num(run)
resfilename = get_result_file(params, params.run_prefix, run_number)
try:
parsed_date = os.path.getmtime(resfilename)
# throw away unless newer than latest or run number greater than maximum
if parsed_date <= params.last_parsed_date and run_number <= params.last_run:
return
except OSError:
print("file {} not found; skipping".format(resfilename))
return
params.last_run += 1
print("run {} added ({}) ({})".format(run_number, params.last_run, parsed_date))
with open(resfilename, 'r') as res:
# make sure table is compatible with run data by inserting any new columns
# always called "run" in table (converted from whatever prefix users use)
result_params = ["run", "parsed_date"]
result_params.extend(res.readline().split('\t'))
if result_params[-1] == '\n':
result_params.pop()
result_params = ["".join(('\"',p.strip(),'\"')) for p in result_params]
pre_sample_pos = res.tell()
result_params_sample = res.readline().split('\t')
# go back to presample location for normal line iteration
res.seek(pre_sample_pos, os.SEEK_SET)
# add new column to table
for c in range(len(result_params)):
if result_params[c] not in params.tracked_columns:
print("ADDING {} as new column".format(result_params[c]))
add_column_to_table(params, db, result_params[c], result_params_sample[c-2]) # -2 accounts for run and parsed date
# add value rows
rows_to_add = []
for line in res:
# run number and parsed_date are always recorded
result_params_val = [params.last_run, parsed_date]
result_params_val.extend(line.split('\t'))
if result_params_val[-1] == '\n':
result_params_val.pop()
# something must be wrong here
if len(result_params_val) > len(result_params):
print("There are {} values for only {} parameters in run {}; \
skipping run".format(len(result_params_val), len(result_params), run_number))
# skip this run
params.last_run -= 1
return
# for when the last column value is the empty string
while len(result_params_val) < len(result_params):
result_params_val.append('')
rows_to_add.append(tuple(convert_strictest(param) if param != nullval else None for param in result_params_val))
param_placeholders = ("?,"*len(result_params)).rstrip(',')
# specify columns to insert into since ordering of columns in file may not match table
insert_rows_command = "INSERT OR IGNORE INTO {} ({}) VALUES ({})".format(
params.task_table_name,
','.join(result_params),
param_placeholders)
cursor = db.cursor()
cursor.executemany(insert_rows_command, rows_to_add)
walk_runs(params, add_run_to_db, check_last_runs_table)
db.commit()
def drop_table(params, db):
cursor = db.cursor()
cursor.execute("DROP TABLE IF EXISTS {}".format(params.task_table_name))
def create_table(params, db, task_table_name):
# creates table schema based on the result file of run 1
with open(get_result_file(params, params.run_prefix, 1), 'r') as res:
result_params = [param.strip() for param in res.readline().rstrip().split('\t')]
result_params_sample = [value.strip() for value in res.readline().rstrip().split('\t')]
while len(result_params_sample) < len(result_params):
result_params_sample.append('')
# check if table name already exists
cursor = db.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='{}'".format(params.task_table_name.strip("[]")))
if cursor.fetchone():
return
# table identifiers cannot be parameterized, so have to use string concatenation
create_table_command = '''CREATE TABLE IF NOT EXISTS {}('''.format(task_table_name)
# include one item for each result_params
create_table_command += "{} INTEGER, ".format(params.run_prefix)
# time is in seconds since epoch
create_table_command += "parsed_date REAL, "
for p in range(len(result_params)):
p_schema = "".join(('\"',result_params[p].strip(),'\" ', type_map.get(type(convert_strictest(result_params_sample[p])), "TEXT")))
p_schema += ", "
create_table_command += p_schema
# treat with unique keys
primary_keys = "PRIMARY KEY({},".format(params.run_prefix) # run always considered primary key
for primary_key in params.key_params:
if primary_key not in result_params:
print("{} does not exist in result file of run 1".format(primary_key))
sys.exit(3)
primary_keys += primary_key
primary_keys += ','
primary_keys = primary_keys[:-1] # remove trailing ,
primary_keys += ')'
create_table_command += primary_keys
create_table_command += ')'
print(create_table_command)
cursor = db.cursor()
cursor.execute(create_table_command)
# reinitialize tracked columns for newly created table
initialize_tracked_columns(params, db)
def initialize_tracked_columns(params, db):
cursor = db.cursor()
cursor.execute("PRAGMA table_info(%s)" % params.task_table_name)
column_info = cursor.fetchall()
column_names = set()
for info in column_info:
column_names.add('\"' + info[1] + '\"')
setattr(params, 'tracked_columns', column_names)
# print('tracked params: ', end='')
# print(params.tracked_columns)
# generating program parameters
class Params(object):
def __iter__(self):
for attr, value in self.__dict__.iteritems():
yield attr, value
def parse_args(ns=None):
"""parse arguments from command line and return as namespace object"""
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent("""\
populates a central database with benchmark information
Generated database:
Each task will become its own table, identified by its absolute path.
Each table will have as many columns as the result file the first time
the table is generated. The columns of the result file should not change
after the table is generated."""),
usage="%(prog)s <task_dir> [OPTIONS]")
# arguments should either end with _dir or _file for use by other functions
parser.add_argument("task_list",
nargs='+',
default=[],
help="directories (relative or absolute) holding runs")
parser.add_argument("--root_directory",
default="",
help="where the task paths are relative to (only set if task paths are relative)")
parser.add_argument("-f", "--result_file",
default="parse_results.txt",
help="result file to look for inside each run; default: %(default)s")
parser.add_argument("-p", "--run_prefix",
default="run",
help="prefix of run directories under the task directory; default: %(default)s")
parser.add_argument("-s", "--parse_script",
default="echo parsed result file not found {run_number} for {task_name}",
help="bash command string to run if parsed file not found;\
use {run_number} and {task_name} to retrieve run information;\
default: %(default)s")
parser.add_argument("-d", "--database",
default="results.db",
help="name of database to store results in; default: %(default)s")
parser.add_argument("--clean",
action='store_true',
default=False,
help="clean (remove) the table for this task; exits afterwards")
parser.add_argument("-k","--key_params", required=True,
nargs='+',
default=[],
help="the combination of key parameters that defines a unique benchmark;")
params = parser.parse_args(namespace=ns)
params.database = os.path.expanduser(params.database)
# initialize to -1 as everything valid must be greater than -1
setattr(params, 'last_parsed_date', -1)
setattr(params, 'last_run', -1)
# if a task list is given (where each line is the path to a task)
if params.root_directory:
if params.root_directory[-1] != '/':
params.root_directory += '/'
# make directory if it does not exist
if not os.path.exists(params.root_directory):
os.makedirs(directory)
params.task_list = [params.root_directory + task for task in params.task_list]
if os.path.isfile(params.task_list[0]):
print("given task list")
task_list = []
with open(params.task_list[0], 'r') as tl:
for line in tl:
task_list.append(params.root_directory + line.rstrip())
params.task_list = task_list
# load first task
load_next_task(params)
return params
def load_next_task(params):
if not params.task_list:
setattr(params, 'task_dir', None)
return
active_task = params.task_list.pop()
print("active task:", active_task)
setattr(params, 'task_dir', active_task)
# name of task
setattr(params, 'task_name', params.task_dir.split('/')[-1])
# path to task
setattr(params, 'task_path', os.path.abspath(os.path.join(params.task_dir, os.pardir)))
# table name in the database
setattr(params, 'task_table_name', get_task_table_name(params))
return params
# walk operations; all take params and run as arguments
def check_result_exists(params, run):
run_number = get_trailing_num(run)
if not os.path.isfile(get_result_file(params, params.run_prefix, run_number)):
parsed_call = params.parse_script.format(
task_dir = params.task_dir,
task_name = params.task_name,
run_number = get_trailing_num(run))
print(run, " called ", parsed_call)
parsed_call = shlex.split(parsed_call)
parsed_call[0] = os.path.expanduser(parsed_call[0])
call(parsed_call)
else:
print(run, " OK")
def add_column_to_table(params, db, column_name, sample_val):
cursor = db.cursor()
col_type = type_map.get(type(convert_strictest(sample_val)), "TEXT")
cursor.execute("ALTER TABLE {table} ADD COLUMN {col} {type}".format(
table=params.task_table_name, col=column_name, type=col_type))
params.tracked_columns.add(column_name)
if __name__ == "__main__":
main()