diff --git a/README.md b/README.md index 1af83634fef63592a721bb72e532b3dbaa7b5d58..75b7f3483d2f3d49bcc748d22e90c6b164c05112 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@ <p align="center"> -<picture> <img src="docs/img/MyLogo.png" alt="Library scheme" height="200"/> -</picture> </p> <h2 align="center"> diff --git a/autotm/algorithms_for_tuning/genetic_algorithm/mutation.py b/autotm/algorithms_for_tuning/genetic_algorithm/mutation.py index 5d8f0e23cff47c484c3abe4e01adff7ba85707e2..39c2c4d18b8b56ebe6d45553e611973a6ac20384 100644 --- a/autotm/algorithms_for_tuning/genetic_algorithm/mutation.py +++ b/autotm/algorithms_for_tuning/genetic_algorithm/mutation.py @@ -11,12 +11,43 @@ def mutation_one_param( high_spm: float, low_n: int, high_n: int, - low_back: float, - high_back: float, + low_back: int, + high_back: int, low_decor: float, high_decor: float, elem_mutation_prob: float = 0.1, ): + """ + One-point mutation + + Each element of the individual is mutated independently with probability elem_mutation_prob + + Parameters + ---------- + individ: List[float] + Individual to be processed + low_spb: float + The lower possible bound for sparsity regularizer of back topics + high_spb: float + The higher possible bound for sparsity regularizer of back topics + low_spm: float + The lower possible bound for sparsity regularizer of specific topics + high_spm: float + The higher possible bound for sparsity regularizer of specific topics + low_n: int + The lower possible bound for amount of iterations between stages + high_n: int + The higher possible bound for amount of iterations between stages + low_back: int + The lower possible bound for amount of back topics + high_back: int + The higher possible bound for amount of back topics + + + Returns + ------- + The individual with mutated parameter values + """ for i in range(len(individ)): if random.random() <= elem_mutation_prob: if i in [2, 3]: diff --git a/autotm/fitness/tm.py b/autotm/fitness/tm.py index 
7e09efd37b5a07f8a0e4675fd7d0577c58182c67..c0dfe855a938075337f7cfa3d5a2b845a404a569 100644 --- a/autotm/fitness/tm.py +++ b/autotm/fitness/tm.py @@ -59,7 +59,7 @@ class Dataset: _ppmi_dict_df_path: str = "ppmi_df.txt" _ppmi_dict_tf_path: str = "ppmi_tf.txt" _mutual_info_dict_path: str = "mutual_info_dict.pkl" - _texts_path: str = "ppp.csv" + _texts_path: str = "prep_df.csv" _labels_path = "labels.pkl" def __init__(self, base_path: str, topic_count: int): diff --git a/autotm/preprocessing/dictionaries_preparation.py b/autotm/preprocessing/dictionaries_preparation.py index 1a6d7c7179955fd48aeb5ce3b371b7894eefab34..8ca3c39fa44c2144ebc685ee1e768431fbc0a766 100644 --- a/autotm/preprocessing/dictionaries_preparation.py +++ b/autotm/preprocessing/dictionaries_preparation.py @@ -322,7 +322,7 @@ def mutual_info_dict_preparation(fname): def prepare_all_artifacts(save_path: str): - DATASET_PATH = os.path.join(save_path, "ppp.csv") + DATASET_PATH = os.path.join(save_path, "prep_df.csv") BATCHES_DIR = os.path.join(save_path, "batches") WV_PATH = os.path.join(save_path, "test_set_data_voc.txt") COOC_DICTIONARY_PATH = os.path.join(save_path, "cooc_dictionary.txt") @@ -333,7 +333,7 @@ def prepare_all_artifacts(save_path: str): ppmi_dict_df = os.path.join(save_path, "ppmi_df.txt") ppmi_dict_tf = os.path.join(save_path, "ppmi_tf.txt") MUTUAL_INFO_DICT_PATH = os.path.join(save_path, "mutual_info_dict.pkl") - DOCUMENTS_TO_BATCH_PATH = os.path.join(save_path, "ppp.csv") + DOCUMENTS_TO_BATCH_PATH = os.path.join(save_path, "prep_df.csv") # TODO: check why batch vectorizer is returned (unused further) prepare_batch_vectorizer( diff --git a/autotm/preprocessing/text_preprocessing.py b/autotm/preprocessing/text_preprocessing.py index dc05cd38ae069b3e7b9d789bc83deaf7df928131..888313370e3e615df16a37b47fcbaba9a985318b 100644 --- a/autotm/preprocessing/text_preprocessing.py +++ b/autotm/preprocessing/text_preprocessing.py @@ -164,7 +164,7 @@ def process_dataset( :return: """ 
os.makedirs(save_path, exist_ok=True) - save_path = os.path.join(save_path, "ppp.csv") + save_path = os.path.join(save_path, "prep_df.csv") data = pd.read_csv(fname) if isinstance(fname, str) else cast(pd.DataFrame, fname) data = parallelize_dataframe( data, lemmatize_text, n_cores, lang=lang, col_to_process=col_to_process