Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
itmo-sai-code
AutoTM
Commits
31c0779e
Commit
31c0779e
authored
1 year ago
by
MK
Browse files
Options
Download
Email Patches
Plain Diff
Fix fname
parent
05e92610
dev/topic-based-text-splitter
1 merge request
!19
Dev/topic based text splitter
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
README.md
+0
-2
README.md
autotm/algorithms_for_tuning/genetic_algorithm/mutation.py
+33
-2
autotm/algorithms_for_tuning/genetic_algorithm/mutation.py
autotm/fitness/tm.py
+1
-1
autotm/fitness/tm.py
autotm/preprocessing/dictionaries_preparation.py
+2
-2
autotm/preprocessing/dictionaries_preparation.py
autotm/preprocessing/text_preprocessing.py
+1
-1
autotm/preprocessing/text_preprocessing.py
with
37 additions
and
8 deletions
+37
-8
README.md
View file @
31c0779e
<p
align=
"center"
>
<picture>
<img
src=
"docs/img/MyLogo.png"
alt=
"Library scheme"
height=
"200"
/>
</picture>
</p>
<h2
align=
"center"
>
...
...
This diff is collapsed.
Click to expand it.
autotm/algorithms_for_tuning/genetic_algorithm/mutation.py
View file @
31c0779e
...
...
@@ -11,12 +11,43 @@ def mutation_one_param(
high_spm
:
float
,
low_n
:
int
,
high_n
:
int
,
low_back
:
floa
t
,
high_back
:
floa
t
,
low_back
:
in
t
,
high_back
:
in
t
,
low_decor
:
float
,
high_decor
:
float
,
elem_mutation_prob
:
float
=
0.1
,
):
"""
One-point mutation
Checking the probability of mutation for each of the elements
Parameters
----------
individ: List[float]
Individual to be processed
low_spb: float
The lower possible bound for sparsity regularizer of back topics
high_spb: float
The higher possible bound for sparsity regularizer of back topics
low_spm: float
The lower possible bound for sparsity regularizer of specific topics
high_spm: float
The higher possible bound for sparsity regularizer of specific topics
low_n: int
The lower possible bound for amount of iterations between stages
high_n: int
The higher possible bound for amount of iterations between stages
low_back:
The lower possible bound for amount of back topics
high_back:
The higher possible bound for amount of back topics
Returns
----------
Updated individuals with exchanged chromosome parts
"""
for
i
in
range
(
len
(
individ
)):
if
random
.
random
()
<=
elem_mutation_prob
:
if
i
in
[
2
,
3
]:
...
...
This diff is collapsed.
Click to expand it.
autotm/fitness/tm.py
View file @
31c0779e
...
...
@@ -59,7 +59,7 @@ class Dataset:
_ppmi_dict_df_path
:
str
=
"ppmi_df.txt"
_ppmi_dict_tf_path
:
str
=
"ppmi_tf.txt"
_mutual_info_dict_path
:
str
=
"mutual_info_dict.pkl"
_texts_path
:
str
=
"p
pp
.csv"
_texts_path
:
str
=
"p
rep_df
.csv"
_labels_path
=
"labels.pkl"
def
__init__
(
self
,
base_path
:
str
,
topic_count
:
int
):
...
...
This diff is collapsed.
Click to expand it.
autotm/preprocessing/dictionaries_preparation.py
View file @
31c0779e
...
...
@@ -322,7 +322,7 @@ def mutual_info_dict_preparation(fname):
def
prepare_all_artifacts
(
save_path
:
str
):
DATASET_PATH
=
os
.
path
.
join
(
save_path
,
"p
pp
.csv"
)
DATASET_PATH
=
os
.
path
.
join
(
save_path
,
"p
rep_df
.csv"
)
BATCHES_DIR
=
os
.
path
.
join
(
save_path
,
"batches"
)
WV_PATH
=
os
.
path
.
join
(
save_path
,
"test_set_data_voc.txt"
)
COOC_DICTIONARY_PATH
=
os
.
path
.
join
(
save_path
,
"cooc_dictionary.txt"
)
...
...
@@ -333,7 +333,7 @@ def prepare_all_artifacts(save_path: str):
ppmi_dict_df
=
os
.
path
.
join
(
save_path
,
"ppmi_df.txt"
)
ppmi_dict_tf
=
os
.
path
.
join
(
save_path
,
"ppmi_tf.txt"
)
MUTUAL_INFO_DICT_PATH
=
os
.
path
.
join
(
save_path
,
"mutual_info_dict.pkl"
)
DOCUMENTS_TO_BATCH_PATH
=
os
.
path
.
join
(
save_path
,
"p
pp
.csv"
)
DOCUMENTS_TO_BATCH_PATH
=
os
.
path
.
join
(
save_path
,
"p
rep_df
.csv"
)
# TODO: check why batch vectorizer is returned (unused further)
prepare_batch_vectorizer
(
...
...
This diff is collapsed.
Click to expand it.
autotm/preprocessing/text_preprocessing.py
View file @
31c0779e
...
...
@@ -164,7 +164,7 @@ def process_dataset(
:return:
"""
os
.
makedirs
(
save_path
,
exist_ok
=
True
)
save_path
=
os
.
path
.
join
(
save_path
,
"p
pp
.csv"
)
save_path
=
os
.
path
.
join
(
save_path
,
"p
rep_df
.csv"
)
data
=
pd
.
read_csv
(
fname
)
if
isinstance
(
fname
,
str
)
else
cast
(
pd
.
DataFrame
,
fname
)
data
=
parallelize_dataframe
(
data
,
lemmatize_text
,
n_cores
,
lang
=
lang
,
col_to_process
=
col_to_process
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Projects
Groups
Snippets
Help