Unverified commit 30b8e0b2, authored by BarabanovaIrina, committed by GitHub

NLP init (#212)

* add text as InputData

* fix reqs

* move tfidf to eval strat level

* add tests

* delete large text data

* add data archive && unpacking

* add data package

* fixes && rebase

* fix nltk deps

* add DataTypesEnum.text

* rebase

* fix imports

* add vectorize test

* fix readme
parent 0d91b416
1 merge request: !227 PyPI release 0.2.0
Showing 3413 additions and 7 deletions (+3413 −7)
@@ -130,6 +130,8 @@ Extended examples:
- Credit scoring problem, i.e. `binary classification task <https://github.com/nccr-itmo/FEDOT/blob/master/cases/credit_scoring_problem.py>`__
- Time series forecasting, i.e. `random process regression <https://github.com/nccr-itmo/FEDOT/blob/master/cases/metocean_forecasting_problem.py>`__
- Spam detection, i.e. `natural language preprocessing <https://github.com/nccr-itmo/FEDOT/blob/master/cases/spam_detection.py>`__
Also, several video tutorials are `available <https://www.youtube.com/playlist?list=PLlbcHj5ytaFUjAxpZf7FbEaanmqpDYhnc>`__ (in Russian).
File added
import os
import tarfile

from sklearn.metrics import roc_auc_score as roc_auc

from fedot.core.chains.chain import Chain
from fedot.core.chains.node import PrimaryNode, SecondaryNode
from fedot.core.data.data import InputData, train_test_data_setup
from fedot.core.data.preprocessing import TextPreprocessingStrategy, EmptyStrategy


def unpack_archived_data():
    archive_path = os.path.abspath(os.path.join('data', 'spamham.tar.gz'))
    if 'spamham' not in os.listdir(os.path.dirname(archive_path)):
        with tarfile.open(archive_path) as file:
            file.extractall(path=os.path.dirname(archive_path))
            print('Unpacking finished')


def execute_chain_for_text_problem(train_data, test_data):
    # tf-idf vectorisation followed by a multinomial naive Bayes classifier
    preproc_node = PrimaryNode('tfidf',
                               manual_preprocessing_func=TextPreprocessingStrategy)
    model_node = SecondaryNode('multinb', nodes_from=[preproc_node],
                               manual_preprocessing_func=EmptyStrategy)
    chain = Chain(nodes=[model_node, preproc_node])
    chain.fit(train_data)
    predicted = chain.predict(test_data)

    roc_auc_metric = roc_auc(y_true=test_data.target, y_score=predicted.predict)

    return roc_auc_metric


def run_text_problem_from_meta_file():
    data_file_abspath = os.path.abspath(os.path.join('data', 'spam', 'spamham.csv'))

    data = InputData.from_text_meta_file(meta_file_path=data_file_abspath)
    train_data, test_data = train_test_data_setup(data, split_ratio=0.7)

    metric = execute_chain_for_text_problem(train_data, test_data)
    print(f'meta_file metric: {metric}')


def run_text_problem_from_files():
    unpack_archived_data()

    data_abspath = os.path.abspath(os.path.join('data', 'spamham'))
    train_path = os.path.join(data_abspath, 'train')
    test_path = os.path.join(data_abspath, 'test')

    train_data = InputData.from_text_files(files_path=train_path)
    test_data = InputData.from_text_files(files_path=test_path)

    metric = execute_chain_for_text_problem(train_data, test_data)
    print(f'origin files metric: {metric}')


def run_text_problem_from_saved_meta_file():
    data_file_abspath = os.path.abspath(os.path.join('data', 'spamham', 'meta_train.csv'))

    data = InputData.from_text_meta_file(meta_file_path=data_file_abspath)
    train_data, test_data = train_test_data_setup(data, split_ratio=0.7)

    metric = execute_chain_for_text_problem(train_data, test_data)
    print(f'meta_file metric: {metric}')


if __name__ == '__main__':
    run_text_problem_from_meta_file()
    run_text_problem_from_files()
    run_text_problem_from_saved_meta_file()
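Note on the data layout the script above assumes: `from_text_files` delegates to `TextBatchLoader`, which walks the given directory and uses each file's parent folder name as the class label, so `train/` and `test/` are expected to contain one sub-folder per class. Below is a minimal sketch with a hypothetical toy dataset (paths, file names and contents are illustrative only, not part of this commit):

# Hedged sketch (hypothetical toy paths and contents): build a tiny
# folder-per-class dataset and load it through InputData.from_text_files.
import os

from fedot.core.data.data import InputData

toy_train_path = os.path.join('data', 'toy_spamham', 'train')  # hypothetical location
for label, text in [('spam', 'win a free prize now'),
                    ('ham', 'see you at the meeting tomorrow')]:
    class_dir = os.path.join(toy_train_path, label)  # folder name becomes the target label
    os.makedirs(class_dir, exist_ok=True)
    with open(os.path.join(class_dir, 'message_1.txt'), 'w') as file:
        file.write(text)

toy_data = InputData.from_text_files(files_path=toy_train_path)
# extract() also writes data/toy_spamham/meta_train.csv as a side effect
print(toy_data.features.tolist(), toy_data.target.tolist())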
File added
File added
import os
import warnings
from copy import copy
from dataclasses import dataclass
@@ -11,6 +12,7 @@ from fedot.core.algorithms.time_series.lagged_features import prepare_lagged_ts_
from fedot.core.data.preprocessing import ImputationStrategy
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum
from fedot.core.data.load_data import TextBatchLoader
@dataclass
@@ -63,6 +65,44 @@ class Data:
        return InputData(idx=idx, features=features, target=target, task=task, data_type=data_type)

    @staticmethod
    def from_text_meta_file(meta_file_path: str = None,
                            label: str = 'label',
                            task: Task = Task(TaskTypesEnum.classification),
                            data_type: DataTypesEnum = DataTypesEnum.text):

        if os.path.isdir(meta_file_path):
            raise ValueError('CSV file expected but got directory')

        df_text = pd.read_csv(meta_file_path)
        df_text = df_text.sample(frac=1).reset_index(drop=True)
        messages = df_text['text'].astype('U').tolist()

        features = np.array(messages)
        target = df_text[label]
        idx = [index for index in range(len(target))]

        return InputData(idx=idx, features=features,
                         target=target, task=task, data_type=data_type)

    @staticmethod
    def from_text_files(files_path: str,
                        label: str = 'label',
                        task: Task = Task(TaskTypesEnum.classification),
                        data_type: DataTypesEnum = DataTypesEnum.text):

        if os.path.isfile(files_path):
            raise ValueError('Path to the directory expected but got file')

        df_text = TextBatchLoader(path=files_path).extract()

        features = df_text['text']
        target = df_text[label]
        idx = [index for index in range(len(target))]

        return InputData(idx=idx, features=features,
                         target=target, task=task, data_type=data_type)
@dataclass
class InputData(Data):
import os
from abc import ABC, abstractmethod

import pandas as pd


class BatchLoader(ABC):
    def __init__(self, path: str):
        self.path = path
        self.meta_df = None
        self.target_name = 'label'

    @abstractmethod
    def extract(self) -> pd.DataFrame:
        pass

    def _extract_files_paths(self):
        all_files = []
        for root, dirs, files in os.walk(self.path):
            files_paths = []
            for name in files:
                if not name.startswith('.'):
                    whole_file_path = os.path.join(root, name)
                    files_paths.append(whole_file_path)
            if files:
                all_files.extend(files_paths)

        self._load_to_meta_df(all_files)

    def _load_to_meta_df(self, files):
        data_rows = []
        for file in files:
            # the name of the parent directory is used as the class label
            dir_name = os.path.basename(os.path.dirname(file))
            row = [file, dir_name]
            data_rows.append(row)
        self.meta_df = pd.DataFrame(data=data_rows,
                                    columns=['file_path', f'{self.target_name}'])
        # shuffle samples
        self.meta_df = self.meta_df.sample(frac=1).reset_index(drop=True)

    def export_to_csv(self, path: str = None):
        if not path:
            path = self.path
            export_filename = f'meta_{os.path.basename(path)}.csv'
            export_dirname = os.path.dirname(path)
            self.meta_df.to_csv(os.path.join(export_dirname, export_filename))
        else:
            self.meta_df.to_csv(os.path.abspath(path))


class TextBatchLoader(BatchLoader):
    def __init__(self, path: str):
        if os.path.isfile(path):
            raise ValueError('Expected directory path but got file')
        super().__init__(path)

    def extract(self, export: bool = True):
        self._extract_files_paths()
        content_list = []
        for file_path in self.meta_df['file_path'].tolist():
            with open(file_path, 'r') as text_file:
                content = text_file.read()
                content_list.append(content)

        new_column = pd.Series(data=content_list, index=self.meta_df.index)
        self.meta_df.insert(loc=self.meta_df.columns.get_loc('file_path'),
                            column='text',
                            value=new_column)
        self.meta_df = self.meta_df.drop(['file_path'], axis=1)

        if export:
            self.export_to_csv()

        return self.meta_df
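For reference, `extract(export=True)` above also saves a `meta_<dirname>.csv` next to the loaded folder, and that flat format (a `text` column plus a `label` column) is exactly what `InputData.from_text_meta_file` reads back. A minimal sketch with a hypothetical hand-written meta file (file name and contents are illustrative only):

# Hedged sketch (hypothetical file, not part of the commit): write a tiny meta
# CSV by hand and load it through InputData.from_text_meta_file.
import pandas as pd

from fedot.core.data.data import InputData

meta_path = 'toy_meta.csv'  # hypothetical file name
pd.DataFrame({'text': ['win a free prize now', 'see you at the meeting tomorrow'],
              'label': [1, 0]}).to_csv(meta_path, index=False)

toy_data = InputData.from_text_meta_file(meta_file_path=meta_path)
print(toy_data.features, toy_data.target.tolist())  # rows are shuffled on load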
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
@@ -83,6 +88,65 @@ class TsScalingStrategy(Scaling):
        super().__init__(with_imputation=False)


class TextPreprocessingStrategy(PreprocessingStrategy):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.lang = 'english'
        self._download_nltk_resources()

    def fit(self, data_to_fit):
        return self

    def apply(self, data):
        clean_data = []
        for text in data:
            # note: the set() call drops duplicate words and does not preserve word order
            words = set(self._word_vectorize(text))
            without_stop_words = self._remove_stop_words(words)
            words = self._lemmatization(without_stop_words)
            words = [word for word in words if word.isalpha()]
            new_text = ' '.join(words)
            new_text = self._clean_html_text(new_text)
            clean_data.append(new_text)
        return np.array(clean_data)

    @staticmethod
    def _download_nltk_resources():
        # punkt is a tokenizer model; stopwords and wordnet live under corpora/
        for resource, location in [('punkt', 'tokenizers/punkt'),
                                   ('stopwords', 'corpora/stopwords'),
                                   ('wordnet', 'corpora/wordnet')]:
            try:
                nltk.data.find(location)
            except LookupError:
                nltk.download(resource)

    def _word_vectorize(self, text):
        words = nltk.word_tokenize(text)
        return words

    def _remove_stop_words(self, words: set):
        stop_words = set(stopwords.words(self.lang))
        cleared_words = [word for word in words if word not in stop_words]
        return cleared_words

    def _stemming(self, words):
        stemmed_words = [self.stemmer.stem(word) for word in words]
        return stemmed_words

    def _lemmatization(self, words):
        # TODO: part-of-speech tags are not passed to the lemmatizer yet
        lemmas = [self.lemmatizer.lemmatize(word) for word in words]
        return lemmas

    def _clean_html_text(self, raw_text):
        clean_pattern = re.compile('<.*?>')
        text = re.sub(clean_pattern, ' ', raw_text)
        return text


_preprocessing_for_input_data = {
    DataTypesEnum.ts: EmptyStrategy,
    DataTypesEnum.table: Scaling,
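A rough illustration of what `TextPreprocessingStrategy.apply` does to a raw message (a minimal sketch; the exact wording and order of the output are not guaranteed, since tokens are de-duplicated through a set, and the first run may download the required NLTK resources):

# Hedged sketch (illustrative input, not part of the commit).
from fedot.core.data.preprocessing import TextPreprocessingStrategy

strategy = TextPreprocessingStrategy()
cleaned = strategy.apply(['Congratulations, you have won a free prize! Claim your prizes now!!!'])
# stop words and punctuation are removed and the remaining words are lemmatised;
# word order is not preserved
print(cleaned[0])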
@@ -17,6 +17,7 @@ from sklearn.linear_model import (Lasso as SklearnLassoReg,
                                  Ridge as SklearnRidgeReg,
                                  SGDRegressor as SklearnSGD)
from sklearn.naive_bayes import BernoulliNB as SklearnBernoulliNB
from sklearn.naive_bayes import MultinomialNB as SklearnMultinomialNB
from sklearn.neighbors import (KNeighborsClassifier as SklearnKNN,
                               KNeighborsRegressor as SklearnKNNReg)
from sklearn.neural_network import MLPClassifier
@@ -124,7 +125,8 @@ class SkLearnEvaluationStrategy(EvaluationStrategy):
        'svc': CustomSVC,
        'svr': SklearnSVR,
        'sgdr': SklearnSGD,
        'bernb': SklearnBernoulliNB
        'bernb': SklearnBernoulliNB,
        'multinb': SklearnMultinomialNB,
    }

    def __init__(self, model_type: str, params: Optional[dict] = None):
from datetime import timedelta

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from fedot.core.data.data import InputData, OutputData
from fedot.core.models.evaluation.evaluation import EvaluationStrategy


class VectorizeStrategy(EvaluationStrategy):
    __vectorizers_dict = {
        'tfidf': TfidfVectorizer,
        'cntvect': CountVectorizer,
    }

    def __init__(self, model_type, params):
        self.vectorizer = self.__vectorizers_dict.get(model_type)
        super().__init__(model_type, params)

    def fit(self, train_data: InputData):
        features_list = list(train_data.features)
        vectorizer = self.vectorizer().fit(features_list)
        return vectorizer

    def predict(self, trained_model, predict_data: InputData) -> OutputData:
        # the sparse document-term matrix is densified so that the following
        # nodes receive an ordinary table
        return trained_model.transform(list(predict_data.features)).toarray()

    def fit_tuned(self, train_data: InputData, iterations: int,
                  max_lead_time: timedelta = timedelta(minutes=5)):
        raise NotImplementedError()
@@ -5,7 +5,6 @@ import numpy as np
from fedot.core.algorithms.time_series.prediction import post_process_forecasted_ts
from fedot.core.data.data import InputData
from fedot.core.log import Log, default_log
from fedot.core.models.evaluation.evaluation import EvaluationStrategy
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.model_types_repository import ModelMetaInfo, ModelTypesRepository
from fedot.core.repository.tasks import Task, TaskTypesEnum, compatible_task_types
@@ -69,6 +69,15 @@
            "strategies": ["fedot.core.models.evaluation.keras_eval", "KerasForecastingStrategy"],
            "tags": ["nn"],
            "description": "Implementations of the keras-based neural models for the time series forecasting"
        },
        "text_classification": {
            "tasks": "[TaskTypesEnum.classification]",
            "input_type": "[DataTypesEnum.text]",
            "output_type": "[DataTypesEnum.table]",
            "forbidden_node_types": ["secondary"],
            "strategies": ["fedot.core.models.evaluation.vectorize", "VectorizeStrategy"],
            "tags": ["nlp", "non-default"],
            "description": "Text classification"
        }
    },
    "models": {
@@ -185,6 +194,14 @@
        "lstm": {
            "meta": "keras_forecasting",
            "tags": ["deep"]
        }
        },
        "multinb": {
            "meta": "sklearn_class",
            "tags": ["bayesian", "non-default"]
        },
        "tfidf": {
            "meta": "text_classification",
            "tags": ["text", "non-default"]
        }
    }
}
@@ -12,3 +12,5 @@ class DataTypesEnum(Enum):
    # 2d dataset with lagged features - (n, window_len * features)
    ts_lagged_table = 'time_series_lagged_table'

    # unstructured raw texts
    text = 'text'
@@ -21,3 +21,4 @@ hyperopt==0.2.4
joblib==0.17.0
tensorflow==2.1.0; python_version <= '3.7'
tensorflow==2.3.1; python_version == '3.8'
nltk==3.5
\ No newline at end of file
@@ -11,7 +11,7 @@ from fedot.core.chains.chain_tune import Tune
from fedot.core.chains.node import PrimaryNode, SecondaryNode
from fedot.core.data.data import InputData, train_test_data_setup
from fedot.core.repository.tasks import Task, TaskTypesEnum
from test.test_chain_import_export import create_four_depth_chain
from test.utilities.test_chain_import_export import create_four_depth_chain
seed(1)
np.random.seed(1)
@@ -4,7 +4,7 @@ from sklearn.datasets import load_iris
from fedot.core.chains.node import PrimaryNode
from fedot.core.data.data import InputData
from fedot.core.data.preprocessing import Normalization
from fedot.core.data.preprocessing import Normalization, TextPreprocessingStrategy
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum
@@ -38,3 +38,21 @@ def test_node_with_manual_preprocessing_has_correct_behaviour_and_attributes(dat
    assert not np.array_equal(default_node_prediction.predict, manual_node_prediction.predict)
    assert node_manual.descriptive_id == '/n_logit_default_params_custom_preprocessing=Normalization'


def test_text_preprocessing_strategy():
    test_text = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]

    preproc_strategy = TextPreprocessingStrategy()

    fit_result = preproc_strategy.fit(test_text)
    apply_result = preproc_strategy.apply(test_text)

    assert isinstance(fit_result, TextPreprocessingStrategy)
    assert apply_result[0] != test_text[0]
from sklearn.feature_extraction.text import TfidfVectorizer

from fedot.core.data.data import InputData
from fedot.core.models.evaluation.vectorize import VectorizeStrategy
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import TaskTypesEnum, Task


def test_vectorize_tfidf_strategy():
    train_text = ['This document first', 'second This document', 'And one third',
                  'Is document first']
    test_text = ['document allow', 'spam not found', 'is are']

    train_data = InputData(idx=list(range(len(train_text))), features=train_text,
                           target=[0, 0, 1, 0], data_type=DataTypesEnum.text,
                           task=Task(TaskTypesEnum.classification))
    test_data = InputData(idx=list(range(len(test_text))), features=test_text,
                          target=[0, 1, 0], data_type=DataTypesEnum.text,
                          task=Task(TaskTypesEnum.classification))

    vectorizer = VectorizeStrategy(model_type='tfidf', params=None)
    vectorizer_fitted = vectorizer.fit(train_data)

    predicted = vectorizer.predict(trained_model=vectorizer_fitted,
                                   predict_data=test_data)

    assert isinstance(vectorizer_fitted, TfidfVectorizer)
    # the fitted vocabulary holds 8 distinct tokens, so each output vector has 8 features
    assert len(predicted[0]) == 8
import os
import pathlib
import shutil
from os.path import join, abspath, basename

import pytest

from fedot.core.data.load_data import TextBatchLoader


@pytest.fixture(scope="session", autouse=True)
def create_test_data(request):
    test_data_dir = 'loader_test_data_dir'
    # create files
    created_files = []
    for dir_index in range(1, 3):
        # create subdir
        dir_name = abspath(join(test_data_dir, f'subdir{dir_index}'))
        pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)
        for file_index in range(1, 3):
            # create file in subdir
            file_name = join(dir_name,
                             f'{dir_index}_subdir_{file_index}_file.txt')
            # fill in file
            with open(file_name, 'w') as file:
                file.write(f'{basename(file_name)} content')
            created_files.append(file_name)

    request.addfinalizer(remove_test_data)


def remove_test_data():
    test_data_path = 'loader_test_data_dir'
    shutil.rmtree(test_data_path)
    os.remove('meta_loader_test_data_dir.csv')


def test_text_batch_loader():
    path = 'loader_test_data_dir'
    test_loader = TextBatchLoader(path)

    df = test_loader.extract()
    contents = sorted(df['text'].tolist())

    assert df.size == 8
    assert contents[0] == '1_subdir_1_file.txt content'
@@ -2,10 +2,10 @@ import os
import pytest
from fedot.core.data.data import InputData, train_test_data_setup
from fedot.core.data.data import train_test_data_setup, InputData
from fedot.core.log import Log, LogManager, default_log
from fedot.core.models.model import Model
from test.test_chain_import_export import create_four_depth_chain
from test.utilities.test_chain_import_export import create_four_depth_chain
@pytest.fixture()