Commit 60b58ab3 authored by Alexey Lapin

Added generation of column descriptions, updated project structure, slightly updated architecture

Showing with 741 additions and 347 deletions
File moved
from pprint import pprint

from fedotllm.actions import ModelAction
from fedotllm.data import Dataset
from fedotllm.web_api import WebAssistant


def main():
    # Endpoint of the internally hosted 8b chat model
    LLAMA8B = "http://10.32.2.2:8672/v1/chat/completions"

    dataset = Dataset.load_from_path('datasets/titanic')
    train = dataset.splits[0]

    model = WebAssistant(LLAMA8B, model_type='8b')
    action = ModelAction(model=model)

    # Generate a one-sentence description for every column of the train split
    # and store it in the split's column metadata.
    column_descriptions = action.generate_all_column_description(split=train, dataset=dataset)
    train.set_column_descriptions(column_descriptions)
    pprint(train.get_column_descriptions())


if __name__ == '__main__':
    main()
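
# Note: Dataset.load_from_path expects a metadata.json file in the dataset
# directory. A minimal, illustrative layout (field values are examples;
# per-split "description" and "columns" with "hint"/"description" entries
# are optional):
#
# {
#     "name": "titanic passengers survival",
#     "description": "passengers survived the Titanic shipwreck",
#     "goal": "predict which passengers survived",
#     "splits": [
#         {"name": "train", "path": "train.csv",
#          "columns": {"Sex": {"hint": "passenger sex, male/female"}}},
#         {"name": "test_X", "path": "test_X.csv"}
#     ]
# }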
File moved
import ast
import json
import os
import random
import re
from typing import Dict

import pandas as pd
from requests.exceptions import RequestException
from tenacity import (retry, retry_if_exception_type, stop_after_attempt,
                      wait_random_exponential)

from fedotllm.data import Dataset, Split
_MAX_RETRIES = 6
class ModelAction:
def __init__(self, model) -> None:
self.model = model
    def run_model_multicall(self, tokenizer, generation_config, prompts):
        """Run all prompts on the local model."""
responses = {}
for task in prompts:
messages = [
{"role": "system", "content": prompts[task]["system"]},
{"role": "context", "content": prompts[task]["context"]},
{"role": "user", "content": prompts[task]["task"]},
]
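            # NB: "context" is not a standard chat role; this assumes the
            # tokenizer's chat template explicitly supports it, otherwise
            # apply_chat_template will reject the message.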
input_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt"
).to(self.model.device)
outputs = self.model.generate(
input_ids,
**generation_config
)
response = outputs[0][input_ids.shape[-1]:]
responses[task] = tokenizer.decode(
response, skip_special_tokens=True)
return responses
@retry(
wait=wait_random_exponential(min=1, max=60),
stop=stop_after_attempt(_MAX_RETRIES),
retry=(retry_if_exception_type(RequestException)
| retry_if_exception_type(RuntimeError)),
reraise=True
)
    def run_web_model_multicall(self, prompts):
        """Run all prompts on the web model."""
responses = {}
for task in prompts:
self.model.set_sys_prompt(prompts[task]["system"])
self.model.add_context(prompts[task]["context"])
response = self.model(prompts[task]["task"], as_json=True)
responses[task] = response
return responses
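
    # Illustrative shape of the `prompts` argument accepted by both
    # multicall methods (the notebook below shows a real instance):
    # prompts = {
    #     "task_type": {"system": "<dataset description>",
    #                   "context": None,
    #                   "task": "<instruction>"},
    #     ...
    # }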
@retry(
stop=stop_after_attempt(_MAX_RETRIES),
reraise=True
)
    def __generate_column_description(self, column: pd.Series, split: Split, dataset: Dataset):
        """
        Generate a description for a column.

        Args:
            column (pd.Series): The column for which the description is generated.
            split (Split): The split containing the column; used to look up the column hint.
            dataset (Dataset): The dataset containing the column.

        Returns:
            dict: A dictionary containing the generated description for the column.

        Raises:
            RuntimeError: If the answer is not found in the response or if the 'data' node is not found in the response.
        """
FIND_ANSWER = re.compile(
r"\{['\"]data['\"]\s*:\s*\{['\"]type['\"]\s*:\s*['\"]string['\"]\s*\,\s*['\"]description['\"]\s*:\s*['\"].*['\"]\}\}")
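        # FIND_ANSWER matches, after the normalization below, answers of the
        # expected form:
        #   {'data': {'type': 'string', 'description': '<one-line text>'}}
        # i.e. the JSON schema requested in generate_all_column_description.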
user_template = """Dataset Title: {title}
Dataset description: {ds_descr}
Column name: {col_name}
Column hint: {hint}
Column values:
```
{values}
```
"""
self.model.set_context(column.head(30).to_markdown(index=False))
column_uniq_vals = column.unique().tolist()
        column_vals = pd.Series(
            column_uniq_vals if len(column_uniq_vals) < 30
            else random.sample(column_uniq_vals, k=30),
            name=column.name
        )
user_prompt = user_template.format(
title=dataset.name,
ds_descr=dataset.description,
col_name=column.name,
hint=split.get_column_hint(column.name),
values=column_vals.to_markdown(index=False)
)
        response = self.model(user_prompt, as_json=True)
        # Normalize before matching; note that capitalize() also lowercases
        # everything after the first character of the response.
        response = response.strip().replace('\n', '').capitalize()
        answer = re.findall(FIND_ANSWER, response)
        if not answer:
            raise RuntimeError(f"Answer not found in: {response}")
        dict_resp = ast.literal_eval(answer[0])
        if "data" not in dict_resp:
            raise RuntimeError(f"'data' node not found in: {response}")
        return dict_resp
    def generate_all_column_description(self, split: Split, dataset: Dataset) -> Dict[str, str]:
        """Generate descriptions for all columns in the provided split.

        Args:
            split (Split): The split containing the columns to describe.
            dataset (Dataset): The dataset the split belongs to.

        Returns:
            A dictionary where keys are column names and values are the generated descriptions.
        """
schema = {
"data": {
"type": "string",
"description": "one line plain text"
}
}
        sys_prompt = """You are a helpful AI assistant.
The user will enter one column from a dataset, and the assistant will produce a one-sentence description of the data in this column.
Don't make assumptions about what values to plug into functions. Use the column hint.
Output format: only JSON using the schema defined here: {schema}""".format(schema=json.dumps(schema))
self.model.set_sys_prompt(sys_prompt)
result = {}
for col_name in split.data.columns:
result[col_name] = self.__generate_column_description(column=split.data[col_name],
split=split,
dataset=dataset)['data']['description']
return result
def process_model_responses(responses):
    responses["categorical_columns"] = responses["categorical_columns"].split("\n")
    responses["task_type"] = responses["task_type"].lower()
    return responses
def save_model_responses(responses, path):
with open(os.sep.join([path, 'model_responses.json']), 'w') as json_file:
json.dump(responses, json_file)
import json
import os
import requests
import arff
import pandas as pd
-from typing import Any
+from typing import Any, Dict
class Split:
"""
Split within dataset object
"""
-    def __init__(self, name, data, path, description) -> None:
+    def __init__(self, name: str, data: pd.DataFrame, path: str, description: str, columns: dict | None) -> None:
"""
        Initialize an instance of a Split.
"""
@@ -19,44 +20,77 @@ class Split:
self.data = data
self.path = path
self.description = description
+        # init columns metadata info
+        self.columns_meta = {col: {} for col in data.columns}
+        if columns is not None:
+            for col, metainfo in columns.items():
+                if col not in self.columns_meta:
+                    raise RuntimeError(f"Failed to find the column {col} defined in the metadata.json file")
+                if 'hint' in metainfo:
+                    self.columns_meta[col]['hint'] = metainfo['hint']
+                if 'description' in metainfo:
+                    self.columns_meta[col]['description'] = metainfo['description']
def get_description(self):
return f"The {self.name} split contains following columns: {self.data.columns}. It is described as {self.description}"
def get_text_columns(self):
return list(self.data.select_dtypes(include=['object']).columns)
def get_numeric_columns(self):
return list(self.data.select_dtypes(include=['number']).columns)
def get_unique_counts(self):
return self.data.apply(lambda col: col.nunique())
def get_unique_ratios(self):
-        return self.data.apply(lambda col: col.nunique() / len(col))
+        return self.data.apply(lambda col: round(col.nunique() / len(col.dropna()), 2))
def get_column_types(self):
return self.data.apply(lambda col: "string" if col.name in self.get_text_columns() else "numeric")
-    def get_head_by_column(self, column_name, count = 10):
+    def get_head_by_column(self, column_name, count=10):
return list(self.data[column_name].head(count))
def get_column_descriptions(self):
-        unique_counts = self.get_unique_counts()
-        unique_ratios = self.get_unique_ratios()
-        column_types = self.get_column_types()
-        column_descriptions = [f"{column_name}: {column_types[column_name]}"
-                               f"{100 * unique_ratios[column_name]:.2f}% unique values, examples: {self.get_head_by_column(column_name)}"
-                               for column_name in self.data.columns]
-        return column_descriptions
+        return {key: value['description'] for key, value in self.columns_meta.items() if 'description' in value}
+    def get_column_hint(self, column_name: str) -> None | str:
+        """
+        Get the hint associated with a specific column.
+
+        Args:
+            column_name (str): The name of the column.
+
+        Returns:
+            str or None: The hint associated with the column, or None if not found.
+        """
+        return self.columns_meta.get(column_name, {}).get('hint')
+
+    def set_column_descriptions(self, column_description: Dict[str, str]) -> None:
+        """
+        Set descriptions for columns in the metadata.
+
+        Args:
+            column_description (Dict[str, str]): A dictionary where keys are column names and values are descriptions.
+        """
+        for key, value in column_description.items():
+            if key in self.columns_meta:
+                self.columns_meta[key]['description'] = value
class Dataset:
"""
Dataset object that represents an ML task and may contain multiple splits
"""
-    def __init__(self, name, description, goal, splits, train_split_name) -> None:
+    def __init__(self, name, description, goal, splits) -> None:
"""
Initialize an instance of a Dataset.
"""
@@ -64,7 +98,7 @@ class Dataset:
self.description = description
self.goal = goal
self.splits = splits
-        self.train_split_name = train_split_name
@classmethod
def load_from_path(cls, path):
@@ -75,42 +109,39 @@ class Dataset:
"""
with open(os.sep.join([path, 'metadata.json']), 'r') as json_file:
dataset_metadata = json.load(json_file)
-        #load each split file
+        # load each split file
splits = []
-        for split_name in dataset_metadata['split_names']:
-            split_path = os.sep.join([path, dataset_metadata["split_paths"][split_name]])
-            split_description = dataset_metadata["split_descriptions"][split_name]
+        for split in dataset_metadata['splits']:
+            split_name = split['name']
+            split_path = os.sep.join([path, split["path"]])
+            split_description = split.get("description", '')
+            split_columns = split.get('columns', None)
if split_path.split(".")[-1] == "csv":
data = pd.read_csv(split_path)
-                split = Split(data = data,
-                              name = split_name,
-                              path = split_path,
-                              description = split_description)
+                split = Split(data=data,
+                              name=split_name,
+                              path=split_path,
+                              description=split_description,
+                              columns=split_columns)
splits.append(split)
elif split_path.split(".")[-1] == "arff":
-                data = arff.loadarff(split_path)
-                split = Split(data = pd.DataFrame(data[0]),
-                              name = split_name,
-                              path = split_path,
-                              description = split_description)
+                data = pd.DataFrame(arff.loadarff(split_path)[0])
+                split = Split(data=data,
+                              name=split_name,
+                              path=split_path,
+                              description=split_description,
+                              columns=split_columns)
splits.append(split)
else:
print(f"split {split_path}: unsupported format")
        # if we have model responses saved already - obsolete for now
        # if os.path.exists(os.sep.join([path, 'model_responses.json'])):
        #     with open(os.sep.join([path, 'model_responses.json']), 'r') as json_file:
        #         model_responses = json.load(json_file)
        #         dataset_metadata.update(model_responses)
return cls(
-            name = dataset_metadata["name"],
-            description = dataset_metadata["description"],
-            goal = dataset_metadata["goal"],
-            splits = splits,
-            train_split_name = dataset_metadata["train_split_name"],
-        )
+            name=dataset_metadata["name"],
+            description=dataset_metadata["description"],
+            goal=dataset_metadata["goal"],
+            splits=splits,
+        )
def get_description(self):
train_split = next(split for split in self.splits if split.name == self.train_split_name)
@@ -118,18 +149,19 @@ class Dataset:
column_descriptions = train_split.get_column_descriptions()
introduction_lines = [
-            f"Assume we have a dataset called '{self.name}', which describes {self.description}, and the task is to {self.goal}.",
-            f""
-        ] + split_description_lines + [
-            f"Below is the type (numeric or string), unique value count and ratio for each column, and few examples of values:",
-            f""
-        ] + column_descriptions + [
-            f"",
-        ]
+            f"Assume we have a dataset called '{self.name}', which describes {self.description}, and the task is to {self.goal}.",
+            "",
+        ] + split_description_lines + [
+            "Below is the type (numeric or string), unique value count and ratio for each "
+            "column, and a few examples of values:",
+            "",
+        ] + column_descriptions + [
+            "",
+        ]
return "\n".join(introduction_lines)
def get_metadata_description(self):
-        splits_names = [split.name for split in self.splits ]
+        splits_names = [split.name for split in self.splits]
description = f"name: {self.name} \ndescription: {self.description} \ntrain_split_name: {self.train_split_name} \nsplits: {splits_names}"
-        return description
\ No newline at end of file
+        return description
File moved
from typing import Literal, Mapping, Tuple, Union
import pandas as pd
def format_dataframes(data: Union[pd.DataFrame, Mapping[str, pd.DataFrame]]) -> str:
"""
Formats the input dataframe or dataframes into markdown format.
Args:
data (pd.DataFrame, Mapping[str, pd.DataFrame]): Input data to be formatted.
Returns:
str: Formatted data in markdown format.
"""
outer = "\n\n```\n{}\n```\n\n"
if isinstance(data, pd.DataFrame):
inner = f"{data.to_markdown(index=False)}"
else:
inner = "\n\n".join(
f"### {key}\n{df.to_markdown(index=False)}"
for key, df in data.items()
)
return outer.format(inner)
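
# Usage sketch (illustrative):
#   format_dataframes(df)
#       -> the single table rendered via to_markdown, wrapped in a ``` fence
#   format_dataframes({"train": train_df, "test": test_df})
#       -> one "### <name>" heading plus table per dataframe, in one fence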
def filter_entities(panel_df: pd.DataFrame, basket: list):
entity_col = panel_df.columns[0]
# Filter rows where the value in the entity column is in the basket
df = panel_df[panel_df[entity_col].isin(basket)]
if df.empty:
raise ValueError(f"No matching entities found in panel given basket: {basket}")
return df
File moved
@@ -29,8 +29,8 @@ class WebAssistant:
"""
self._system_prompt = new_prompt
-    def add_context(self, context: str) -> None:
-        """Add a context to model's prompt
+    def set_context(self, context: str) -> None:
+        """Set the context for the model's prompt
Args:
context (str): context related to question.
@@ -97,10 +97,11 @@ class WebAssistant:
else:
raise NotImplementedError("Model type not supported")
-        response = requests.post(url=self._url, json=formatted_prompt, timeout=timeout)
-        if response.status_code != requests.codes.ok:
-            raise RuntimeError("Error while communicating with the model")
+        try:
+            response = requests.post(url=self._url, json=formatted_prompt, timeout=timeout)
+            response.raise_for_status()
+        except requests.exceptions.HTTPError as err:
+            raise RuntimeError(err)
if kwargs.get('as_json'):
try:
......
File moved
%% Cell type:markdown id:5e96ab50-b396-4e48-81cc-160058738c4e tags:
# Setup
%% Cell type:code id:5ac5964c-acfa-4912-92af-7946c6fa7398 tags:
```
import os
import numpy as np
import pandas as pd
from pprint import pprint
from zip import unzip_archive
from fedot_util import run_example
from llm_util import run_web_model_multicall
from web_api import WebAssistant
from data import Dataset
import prompts
```
%% Cell type:markdown id:1f2ac346 tags:
# Loading the data
%% Cell type:code id:2a1cabc3 tags:
```
dataset_path = [
'titanic',
'credit-g'
][0]
dataset_path = os.sep.join(['datasets', dataset_path])
# zip_filename = f"{dataset_path}.zip"
# os.makedirs(dataset_path, exist_ok=True)
# unzip_archive(zip_filename, dataset_path)
```
%% Cell type:code id:O3IlOI4PeVXT tags:
```
dataset = Dataset.load_from_path(dataset_path)
dataset_description = dataset.get_description()
dataset_metadata_description = dataset.get_metadata_description()
print(dataset_metadata_description)
```
%%%% Output: stream
name: titanic passengers survival
description: passengers survived the Titanic shipwreck
train_split_name: train
splits: dict_keys(['train', 'test_X', 'test_y'])
%% Cell type:markdown id:b667d022 tags:
# Querying the web model
%% Cell type:code id:06ec2375 tags:
```
task_prompts = {
"categorical_columns": {
"system": dataset_description,
"task": prompts.categorical_definition_prompt,
"context": prompts.categorical_definition_context,
},
"target_column": {
"system": dataset_description,
"task": prompts.target_definition_prompt,
"context": None,
},
"task_type": {
"system": dataset_description,
"task": prompts.task_definition_prompt,
"context": None,
}
}
# Model selection
model_type = ["8b", "70b"][0]
url = "http://10.32.2.2:8672/v1/chat/completions"
if model_type == "70b":
url = "http://10.32.15.21:6672/generate"
model = WebAssistant(url)
responses = run_web_model_multicall(
model, task_prompts
)
pprint(responses)
```
%%%% Output: error
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[6], line 1
----> 1 from prompts import categorical_definition_prompt, target_definition_prompt, task_definition_prompts
2 task_prompts = {
3 "categorical_columns": categorical_definition_prompt,
4 "target_column": target_definition_prompt,
5 "task_type": task_definition_prompts,
6 }
8 url = "http://10.32.15.21:6672/generate"
ImportError: cannot import name 'task_definition_prompts' from 'prompts' (c:\Users\Stas\Documents\python\AutoML-LLM\prompts.py)
%% Cell type:markdown id:8382401d tags:
# Running the framework
%% Cell type:code id:aaJnOwCvr_wR tags:
```
if dataset_path.endswith('titanic'):  # dataset_path was joined with 'datasets' above
test_df = dataset_metadata["splits"]["test_X"].merge(dataset_metadata["splits"]["test_y"],
on='PassengerId', how='inner')
else:
test_df = dataset_metadata["splits"]["test"]
train_df = dataset_metadata["splits"]["train"]
prediction = run_example(train_df=train_df, test_df=test_df,
                         dataset_metadata=dataset_metadata)
```
%%%% Output: stream
Generations: 0%| | 0/10000 [02:16<?, ?gen/s]
%%%% Output: stream
{'roc_auc': 0.941, 'accuracy': 0.828}
%% Cell type:code id:2f61d093 tags:
```
prediction[:5]
```
%%%% Output: execute_result
array([[0],
[0],
[0],
[0],
[1]], dtype=int64)
%% Cell type:code id:2e45d4f3 tags:
```
result_df = pd.DataFrame(prediction, columns=[dataset_metadata["target_column"]])
result_df.to_csv(f"{dataset_path}/predictions.csv")
```
import os
import json
from tenacity import retry, stop_after_attempt, wait_random_exponential, retry_if_exception_type
from requests.exceptions import RequestException
_MAX_RETRIES = 6
def run_model_multicall(model, tokenizer, generation_config, prompts):
    """Run all prompts on the local model."""
responses = {}
for task in prompts:
messages = [
{"role": "system", "content": prompts[task]["system"]},
{"role": "context", "content": prompts[task]["context"]},
{"role": "user", "content": prompts[task]["task"]},
]
input_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt"
).to(model.device)
outputs = model.generate(
input_ids,
**generation_config
)
response = outputs[0][input_ids.shape[-1]:]
responses[task] = tokenizer.decode(response, skip_special_tokens=True)
return responses
@retry(
wait=wait_random_exponential(min=1, max=60),
stop=stop_after_attempt(_MAX_RETRIES),
retry=(retry_if_exception_type(RequestException) | retry_if_exception_type(RuntimeError)),
reraise=True
)
def run_web_model_multicall(model, prompts):
    """Run all prompts on the web model."""
responses = {}
for task in prompts:
model.set_sys_prompt(prompts[task]["system"])
model.add_context(prompts[task]["context"])
response = model(prompts[task]["task"], as_json=True)
responses[task] = response
return responses
def process_model_responses(responses):
responses["categorical_columns"] = responses["categorical_columns"].split("\n")
responses["task_type"] = responses["task_type"].lower()
return responses
def save_model_responses(responses, path):
with open(os.sep.join([path, 'model_responses.json']), 'w') as json_file:
json.dump(responses, json_file)
\ No newline at end of file
@@ -22,6 +22,7 @@
cloudpickle==3.0.0
colorlog==6.8.2
comm==0.2.2
contourpy==1.2.1
+cramjam==2.8.3
cycler==0.12.1
debugpy==1.8.2
decorator==5.1.1
@@ -34,18 +35,21 @@
ete3==3.1.3
exceptiongroup==1.2.2
executing==2.0.1
fastjsonschema==2.20.0
+fastparquet==2024.5.0
fedot==0.7.3.2
filelock==3.15.4
FLAML==2.1.2
fonttools==4.53.1
fqdn==1.5.1
fsspec==2024.6.1
-func_timeout==4.3.5
+func-timeout==4.3.5
+functime==0.9.5
future==1.0.0
graphviz==0.20.3
greenlet==3.0.3
h11==0.14.0
+holidays==0.53
httpcore==1.0.5
httpx==0.27.0
-huggingface-hub==0.23.5
hyperopt==0.2.7
idna==3.7
imageio==2.34.2
@@ -72,7 +76,7 @@
jupyter_client==8.6.2
jupyter_core==5.7.2
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
-jupyterlab==4.2.3
+jupyterlab==4.2.4
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.11
@@ -85,7 +89,6 @@
MarkupSafe==2.1.5
matplotlib==3.9.1
matplotlib-inline==0.1.7
mistune==3.0.2
-mpmath==1.3.0
multiprocess==0.70.16
nbclient==0.10.0
nbconvert==7.16.4
@@ -108,13 +111,15 @@
pillow==10.4.0
platformdirs==4.2.2
plotly==5.22.0
pluggy==1.5.0
+polars==1.2.0
prometheus_client==0.20.0
prompt_toolkit==3.0.47
psutil==6.0.0
ptyprocess==0.7.0
pure-eval==0.2.2
py4j==0.10.9.7
-pyaml==24.4.0
+pyaml==24.7.0
+pyarrow==17.0.0
pycparser==2.22
Pygments==2.18.0
pyparsing==3.1.2
@@ -129,12 +134,10 @@
qtconsole==5.5.2
QtPy==2.4.1
readthedocs-sphinx-search==0.3.2
referencing==0.35.1
-regex==2024.5.15
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.19.0
-safetensors==0.4.3
SALib==1.4.8
scikit-learn==1.2.2
scikit-optimize==0.10.2
@@ -146,7 +149,7 @@
sktime==0.16.1
sniffio==1.3.1
snowballstemmer==2.2.0
soupsieve==2.5
-Sphinx==7.4.5
+Sphinx==7.4.6
sphinx-rtd-theme==2.0.0
sphinxcontrib-applehelp==1.0.8
sphinxcontrib-details-directive==0.1.0
@@ -159,23 +162,21 @@
sphinxcontrib-serializinghtml==1.1.10
SQLAlchemy==2.0.31
stack-data==0.6.3
statsmodels==0.14.2
-sympy==1.13.0
tabulate==0.9.0
tenacity==8.5.0
terminado==0.18.1
testfixtures==8.3.0
thegolem==0.4.0
threadpoolctl==3.5.0
tinycss2==1.3.0
-tokenizers==0.19.1
tomli==2.0.1
-torch==2.3.1
tornado==6.4.1
tqdm==4.65.2
traitlets==5.14.3
-transformers==4.42.4
types-python-dateutil==2.9.0.20240316
typing==3.7.4.3
typing_extensions==4.12.2
tzdata==2024.1
uri-template==1.3.0
urllib3==2.2.2
wcwidth==0.2.13
......