# Source code for pilotscope.PilotConfig

import os

from pilotscope.Common.SSHConnector import SSHConnector
from pilotscope.PilotEnum import DataFetchMethodEnum, DatabaseEnum, TrainSwitchMode, SparkSQLDataSourceEnum
import logging

# Configure the root logger at import time: DEBUG level, with every record
# rendered as "<time> - <logger name> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Named logger exposed by this module under the "PilotScope" name.
pilot_logger = logging.getLogger("PilotScope")


class PilotConfig:
    """
    Container for the configuration of PilotScope.

    It keeps the host address of PilotScope, the name of the database to
    connect to, the request/execution timeouts and related settings. Database
    specific subclasses add their own connection attributes on top.
    """

    def __init__(self, db_type: DatabaseEnum, db="stats_tiny", pilotscope_core_host="localhost",
                 user_data_db_name="PilotScopeUserData", sql_execution_timeout=300,
                 once_request_timeout=300) -> None:
        """
        Initialize the PilotConfig.

        :param db_type: the type of database, i.e. PostgreSQL, SparkSQL, etc.
        :param db: the name of connected database
        :param pilotscope_core_host: the host address of PilotScope in ML side.
        :param user_data_db_name: the created database name for saving the user data. If users want to visit
            these data, they can set db=user_data_db_name.
        :param sql_execution_timeout: the timeout of sql execution, unit: second
        :param once_request_timeout: the timeout of once request, unit: second
        """
        # NOTE: the assignment order below is what `__str__` and `print`
        # expose (they walk `__dict__` in insertion order), so keep it stable.
        self.db_type: DatabaseEnum = db_type
        self.pilotscope_core_host = pilotscope_core_host
        self.data_fetch_method = DataFetchMethodEnum.HTTP
        self.db = db
        self.user_data_db_name = user_data_db_name

        # Deep control is off and the database is treated as local by default.
        self._enable_deep_control = False
        self._is_local = True

        self.sql_execution_timeout = sql_execution_timeout
        self.once_request_timeout = once_request_timeout

        # pretraining
        self.pretraining_model = TrainSwitchMode.WAIT

    def __str__(self):
        """Return the string form of the instance attribute dictionary."""
        return str(self.__dict__)

    def print(self):
        """
        Print the configuration information of PilotScope, one ``name = value`` line per attribute.
        """
        for name, setting in vars(self).items():
            print("{} = {}".format(name, setting))
class PostgreSQLConfig(PilotConfig):
    """
    PilotScope configuration for a PostgreSQL database.

    Besides the connection settings, it can optionally hold the paths and
    host credentials needed for "deep control" of the database (see
    `enable_deep_control_local` and `enable_deep_control_remote`).
    """

    def __init__(self, pilotscope_core_host="localhost", db_host="localhost", db_port="5432", db_user="pilotscope",
                 db_user_pwd="pilotscope", db="stats_tiny") -> None:
        """
        :param pilotscope_core_host: the host address of PilotScope in ML side.
        :param db_host: the host address of database
        :param db_port: the port of database
        :param db_user: the username to log into the database
        :param db_user_pwd: the password to log into the database
        :param db: the name of connected database
        """
        super().__init__(db_type=DatabaseEnum.POSTGRESQL, db=db, pilotscope_core_host=pilotscope_core_host)
        self.db_host = db_host
        self.db_port = db_port
        self.db_user = db_user
        self.db_user_pwd = db_user_pwd

        # for deep control; filled in by enable_deep_control_local / enable_deep_control_remote
        self.pg_bin_path = None
        self.pgdata = None
        self.pg_ctl = None
        self.db_config_path = None
        self.backup_db_config_path = None
        self.db_host_user = None
        self.db_host_pwd = None
        self.db_host_port = None

    def _set_deep_control_paths(self, pg_bin_path, pg_data_path):
        """Record the PostgreSQL binary/data locations and derive the config, backup and pg_ctl paths."""
        self.pg_bin_path = pg_bin_path
        self.pgdata = pg_data_path
        self.backup_db_config_path = os.path.join(pg_data_path, "pilotscope_postgresql_backup.conf")
        self.db_config_path = os.path.join(pg_data_path, "postgresql.conf")
        self.pg_ctl = os.path.join(pg_bin_path, "pg_ctl")

    def enable_deep_control_local(self, pg_bin_path: str, pg_data_path: str):
        """
        Enable deep control for PostgreSQL, such as starting and stopping database, changing config file, etc.
        If you do not need these functions, it is not necessary to set these values.
        If the database and PilotScope Core are on the same machine, you can use this function, i.e.,
        pilotscope_core_host == db_host. Otherwise, use `enable_deep_control_remote`.

        As part of enabling, the current ``postgresql.conf`` in `pg_data_path` is copied to
        ``pilotscope_postgresql_backup.conf`` in the same directory.

        :param pg_bin_path: the directory of binary file of postgresql, e.g., /postgres_install_path/bin
        :param pg_data_path: location of the database data storage
        """
        self._enable_deep_control = True
        self._set_deep_control_paths(pg_bin_path, pg_data_path)

        # Back up the current database config file next to the original.
        with open(self.db_config_path, "r") as f, open(self.backup_db_config_path, "w") as w:
            w.write(f.read())

    def enable_deep_control_remote(self, pg_bin_path, pg_data_path, db_host_user, db_host_pwd, db_host_ssh_port=22):
        """
        Enable deep control for PostgreSQL, such as starting and stopping database, changing config file, etc.
        If you do not need these functions, it is not necessary to set these values.
        If the database and PilotScope Core are not on the same machine, you can use this function, i.e.,
        pilotscope_core_host != db_host. Otherwise, use `enable_deep_control_local`.

        As part of enabling, the current ``postgresql.conf`` in `pg_data_path` on the database host is copied
        over SSH to ``pilotscope_postgresql_backup.conf`` in the same directory.

        :param pg_bin_path: the directory of binary file of postgresql, e.g., /postgres_install_path/bin
        :param pg_data_path: location of the database data storage
        :param db_host_user: the username to log into the database host
        :param db_host_pwd: the password to log into the database host
        :param db_host_ssh_port: the port of ssh service on the database host
        """
        self._is_local = False
        self._enable_deep_control = True

        self.db_host_user = db_host_user
        self.db_host_pwd = db_host_pwd
        self.db_host_port = db_host_ssh_port

        self._set_deep_control_paths(pg_bin_path, pg_data_path)

        ssh_conn = SSHConnector(self.db_host, self.db_host_user, self.db_host_pwd, self.db_host_port)
        ssh_conn.connect()
        try:
            # Back up the current database config file on the database host.
            with ssh_conn.open_file(self.db_config_path, "r") as f:
                with ssh_conn.open_file(self.backup_db_config_path, "w") as w:
                    w.write(f.read())
        finally:
            # Always release the SSH connection, even if the copy fails.
            ssh_conn.close()
class SparkConfig(PilotConfig):
    """
    PilotScope configuration for Spark SQL.

    Holds the Spark application settings, the Spark session options and,
    optionally, the connection settings of a PostgreSQL data source.
    """

    def __init__(self, app_name="testApp", master_url="local[*]") -> None:
        """
        :param app_name: the name of the application of Spark
        :param master_url: the master URL of Spark cluster
        :raises NotImplementedError: if `master_url` is anything other than ``local[*]``
        """
        super().__init__(db_type=DatabaseEnum.SPARK, db=None)

        # spark
        self.app_name = app_name
        self.master_url = master_url
        if self.master_url != "local[*]":
            unsupported_master_msg = (
                "PilotScope only support master_url=local[*]. The more functionalities is developing"
            )
            raise NotImplementedError(unsupported_master_msg)

        # postgresql datasource; populated by use_postgresql_datasource
        self.datasource_type = None
        self.db_host = None
        self.db_port = None
        self.db_user = None
        self.db_user_pwd = None
        self.jdbc = "org.postgresql:postgresql:42.6.0"

        # Spark session options, seeded with the PilotScope switch.
        self.spark_configs = {}
        initial_session_options = {"spark.sql.pilotscope.enabled": True}
        self.set_spark_session_config(initial_session_options)

    def set_spark_session_config(self, config: dict):
        """
        Merge `config` into the stored Spark session options.

        :param config: option names mapped to their values
        :return: this config object
        """
        self.spark_configs.update(config)
        return self

    def enable_cardinality_estimation(self):
        """
        Spark SQL support cost-based optimization but it is disabled by default.
        If you need to enable `pull_subquery_card` and `push_card`, please call this function, and PilotScope
        will set the corresponding parameters. This will consume more time, but the performance of the SQL
        will be better.
        """
        cbo_options = {
            "spark.sql.cbo.enabled": True,
            "spark.sql.cbo.joinReorder.enabled": True,
        }
        self.set_spark_session_config(cbo_options)

    def use_postgresql_datasource(self, db_host="localhost", db_port="5432", db_user="postgres",
                                  db_user_pwd="postgres", db="stats_tiny"):
        """
        Set up a PostgreSQL data source.

        :param db_host: the host of postgresql, defaults to "localhost"
        :param db_port: the network port of postgresql, defaults to "5432"
        :param db_user: the username to log into postgresql, defaults to "postgres"
        :param db_user_pwd: the password of the user, defaults to "postgres"
        :param db: database name, defaults to "stats_tiny"
        """
        self.datasource_type = SparkSQLDataSourceEnum.POSTGRESQL

        # where the data source lives
        self.db_host = db_host
        self.db_port = db_port
        self.db = db

        # credentials used to log in
        self.db_user = db_user
        self.db_user_pwd = db_user_pwd