Python API Reference
The Python API reference serves as the central point of reference for developers. It helps them understand functions, classes, and modules and use them efficiently. It provides a structured overview of all publicly accessible elements of the code and their usage.
In this reference you will find detailed information on:
- Modules and packages: which modules are available and how they are imported.
- Functions and methods: descriptions of parameters, return values, possible exceptions, and usage examples.
- Classes and objects: information on constructors, attributes, and inherited methods.
This documentation is aimed at beginners as well as experienced developers. It is intended to ease getting started, speed up development, and promote the reusability of the code.
evaluation
| MODULE | DESCRIPTION |
|---|---|
| main | Main script with the command line interface. |
| src | Main module for the evaluation code. |
main
Main script with the command line interface.
| FUNCTION | DESCRIPTION |
|---|---|
| main | Run the command line interface. |
main
Run the command line interface.
Source code in docs/repositories-clones/evaluation/main.py
def main() -> None:
"""Run the command line interface."""
parser = argparse.ArgumentParser(description="")
subparsers = parser.add_subparsers(help="subcommand help", dest="subparser_name")
parser_evaluate = subparsers.add_parser("evaluate", help="Run and evaluate an experiment.")
parser_evaluate.add_argument(
"--only-transform",
action="store_true",
help="Run only the transformation, without evaluation and summary.",
)
_ = subparsers.add_parser("create-dataset", help="Create a synthetic dataset.")
combine_experiments_parser = subparsers.add_parser(
"combine-experiments", help="Create a summary for different experiments."
)
combine_experiments_parser.add_argument(
"paths",
nargs="+",
help="The paths to the experiment folders to combine.",
type=Path,
)
# extra logic to have evaluate as the default subcommand
# first parse known args to extract subparser_name
args, extras = parser.parse_known_args()
if args.subparser_name is None:
# if no subparser name is given, use the evaluate subparser
args = parser_evaluate.parse_args(extras)
args.subparser_name = "evaluate"
else:
# if a subparser name is given, parser the args regularly
args = parser.parse_args()
if args.subparser_name == "evaluate":
results_folder = Path(f"results/{settings.experiment_name}__{settings.experiment_suffix}")
results_folder.mkdir(parents=True, exist_ok=True)
# copy the configuration
copy_config_dir = results_folder / "config"
# copy files (excluding hidden files)
shutil.copytree(
Path("config"),
copy_config_dir,
dirs_exist_ok=True,
ignore=shutil.ignore_patterns(r".*"),
)
# write commit hash to copied config directory
try:
commit_hash = (
subprocess.check_output(["/usr/bin/git", "rev-parse", "HEAD"]) # noqa: S603 (command is static)
.decode("utf-8")
.strip()
)
with open(copy_config_dir / "commit_hash.txt", "w", encoding="utf-8") as f:
f.write(f"{commit_hash}\n")
except (subprocess.CalledProcessError, FileNotFoundError) as e:
print("Could not store commit hash:", e)
print("Running transformation...")
transform(results_folder=results_folder)
if not args.only_transform:
print("Running evaluation...")
evaluate(results_folder=results_folder)
print("Running summarize...")
summarize(input_folders=[results_folder], output_folder=results_folder)
print("Done.")
elif args.subparser_name == "create-dataset":
create_dataset()
elif args.subparser_name == "combine-experiments":
output_folder = Path("results") / "combined_experiments" / settings.experiment_suffix
output_folder.mkdir(parents=True, exist_ok=True)
summarize(input_folders=args.paths, output_folder=output_folder)
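The subcommands can also be triggered programmatically, for example from a test. A minimal sketch, assuming main.py is importable from the repository root and the config/ and data/ folders expected by the settings are present:

```python
import sys

from main import main  # import path assumed

# equivalent to `python main.py evaluate --only-transform` on the command line
sys.argv = ["main.py", "evaluate", "--only-transform"]
main()
```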
src
Main module for the evaluation code.
| MODULE | DESCRIPTION |
|---|---|
| create_dataset | Generate a synthetic dataset. |
| evaluate | Main script to evaluate transformed texts. |
| indices | Functions to calculate additional indices. |
| models | Pydantic models. |
| openai_evaluator | Class for text evaluation using OpenAI. |
| request_handling | Utility function for making robust synchronous HTTP POST requests with retries and error handling. |
| settings | Loads the configuration and makes it accessible for other modules. |
| summarize | Script for extended and summarized evaluation. |
| transform | Script for LLM-Evaluation. |
| utils | Utility functions. |
create_dataset
Generate a synthetic dataset.
| FUNCTION | DESCRIPTION |
|---|---|
| create_dataset | Create a synthetic dataset. |
| format_user_prompt | Format the user prompt string with values from the configuration. |
| generate | Perform a chat completion using a system prompt and a user prompt. |
create_dataset
Create a synthetic dataset.
The parameters are provided by configuration files.
Source code in docs/repositories-clones/evaluation/src/create_dataset.py
def create_dataset() -> None:
"""Create a synthetic dataset.
The parameters are provided by configuration files.
"""
data_dir = Path("data")
config = DatasetCreationSettings()
user_prompt_format = format_user_prompt(config=config)
print("=== User Prompt ===")
print(user_prompt_format)
print("===================")
llm_provider = OpenAI(
api_key=config.llm_config.api.auth.secret.get_secret_value(),
base_url=str(config.llm_config.api.url),
)
dataset = generate(
llm_provider=llm_provider,
system_prompt=config.system_prompt,
user_prompt=user_prompt_format,
llm_config=config.llm_config,
)
print("=== LLM Output ===")
print(dataset)
print("==================")
df = pd.DataFrame(dataset.examples, columns=[settings.input_column_name])
if config.output_format == "csv":
output_path = data_dir / f"{config.dataset_name}.csv"
save_dataframe(df, output_path)
elif config.output_format == "xlsx":
output_path = data_dir / f"{config.dataset_name}.xlsx"
df.to_excel(output_path, index=False)
print(f"Datei '{output_path}' mit {len(df)} Beispielen erstellt.")
format_user_prompt
Format the user prompt string with values from the configuration.
| PARAMETER | DESCRIPTION |
|---|---|
| config | Configuration that must contain the keys "user" (str): prompt template with placeholders; "num_examples" (int or str): number of examples to generate; "description" (str): description of the dataset/task; "criteria" (str): criteria or constraints for the examples. TYPE: DatasetCreationSettings |

| RETURNS | DESCRIPTION |
|---|---|
| str | The formatted user prompt. |
Source code in docs/repositories-clones/evaluation/src/create_dataset.py
def format_user_prompt(config: DatasetCreationSettings) -> str:
"""Format the user prompt string with values from the configuration.
Args:
config (DatasetCreationSettings): Configuration dictionary that must contain the keys:
- "user" (str): Prompt template with placeholders.
- "num_examples" (int or str): Number of examples to generate.
- "description" (str): Description of the dataset/task.
- "criteria" (str): Criteria or constraints for the examples.
Returns:
str: The formatted user prompt.
"""
return config.user_prompt.format(
num_examples=config.num_examples,
description=config.description,
criteria=config.criteria,
)
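For illustration, the call is a plain str.format over the three placeholders; the template below is hypothetical and only mimics what the user_prompt in config/create_dataset.yaml might look like:

```python
# Hypothetical template; the real one comes from config/create_dataset.yaml.
template = "Generate {num_examples} examples for the dataset '{description}'. Criteria: {criteria}"
print(template.format(num_examples="10", description="short news texts", criteria="plain language, one sentence each"))
```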
generate
Perform a chat completion using a system prompt and a user prompt.
| PARAMETER | DESCRIPTION |
|---|---|
| llm_provider | An OpenAI client or a compatible LLM provider. TYPE: OpenAI |
| system_prompt | The system-level prompt that defines the model's behavior. TYPE: str |
| user_prompt | The user input prompt. TYPE: str |
| llm_config | Model configuration parameters. Must include the attribute `label` specifying the model name or identifier. TYPE: LLMConfig |

| RETURNS | DESCRIPTION |
|---|---|
| Dataset | A `Dataset` object containing the generated examples. |
Source code in docs/repositories-clones/evaluation/src/create_dataset.py
def generate(
llm_provider: OpenAI, system_prompt: str, user_prompt: str, llm_config: LLMConfig
) -> Dataset:
"""Perform a chat completion using a system prompt and a user prompt.
Args:
llm_provider (OpenAI):
An OpenAI client or a compatible LLM provider.
system_prompt (str):
The system-level prompt that defines the model's behavior.
user_prompt (str):
The user input prompt.
llm_config (LLMConfig):
Model configuration parameters. Must include the attribute `"label"`
specifying the model name or identifier.
Returns:
Dataset:
A `Dataset` object containing the generated examples.
"""
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
]
response = llm_provider.responses.parse(
model=llm_config.label,
input=messages,
text_format=Dataset,
)
content = response.output_parsed
return content
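A usage sketch mirroring the body of create_dataset; the import paths are assumptions, an OpenAI-compatible endpoint must be reachable, and config/create_dataset.yaml must exist:

```python
from openai import OpenAI

from src.create_dataset import format_user_prompt, generate  # import path assumed
from src.models.general import DatasetCreationSettings       # import path assumed

config = DatasetCreationSettings()  # reads config/create_dataset.yaml
llm_provider = OpenAI(
    api_key=config.llm_config.api.auth.secret.get_secret_value(),
    base_url=str(config.llm_config.api.url),
)
dataset = generate(
    llm_provider=llm_provider,
    system_prompt=config.system_prompt,
    user_prompt=format_user_prompt(config=config),
    llm_config=config.llm_config,
)
print(dataset.examples)
```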
evaluate
Main script to evaluate transformed texts.
| FUNCTION | DESCRIPTION |
|---|---|
| evaluate | Main function to run the evaluation. |
| evaluate_folder | Runs the evaluation for the data in the results_folder. |
| run_index_evaluation | Calculates indices for the texts in a DataFrame. |
| run_llm_criteria_evaluation | Performs an LLM-based criteria evaluation on the transformed texts in a DataFrame. |
evaluate
Main function to run the evaluation.
The parameters are provided by configuration files.
| PARAMETER | DESCRIPTION |
|---|---|
| results_folder | The path to the folder where the results of this run are stored. TYPE: Path |
Source code in docs/repositories-clones/evaluation/src/evaluate.py
def evaluate(results_folder: Path) -> None:
"""Main function to run the evaluation.
The parameters are provided by configuration files.
Args:
results_folder (Path): The path to the folder were the results of this run
are stored.
"""
# filter selected evaluation tasks
tasks = {key: settings.llm_tasks[key] for key in settings.tasks}
# setup LLM provider for evaluation
evaluation_llm_provider = EvaluationProviderOpenAiLike(settings.llm_config.evaluation)
# folder to store the evaluation data
results_folder.mkdir(parents=True, exist_ok=True)
evaluation_dirs = []
for directory, _, files in results_folder.walk():
if settings.transformed_data_filename in files:
evaluation_dirs.append(directory)
evaluation_dirs = sorted(evaluation_dirs, key=str)
for directory in evaluation_dirs:
evaluate_folder(
results_folder=directory,
evaluation_llm_provider=evaluation_llm_provider,
tasks=tasks,
original_column=settings.input_column_name,
transformed_column=settings.output_column_name,
)
evaluate_folder
evaluate_folder(*, results_folder, evaluation_llm_provider, tasks, original_column, transformed_column)
Runs the evaluation for the data in the results_folder.
| PARAMETER | DESCRIPTION |
|---|---|
| results_folder | Path to the folder where data and results are stored. TYPE: Path |
| evaluation_llm_provider | An object that provides the `evaluate` method to perform the task comparison evaluation. TYPE: EvaluationProviderOpenAiLike |
| tasks | A dictionary where the keys are task identifiers and the values are task descriptions used for the task evaluation. TYPE: dict |
| original_column | Name of the column containing the original text. TYPE: str |
| transformed_column | Name of the column containing the transformed text. TYPE: str |

| RETURNS | DESCRIPTION |
|---|---|
| None | None |
Source code in docs/repositories-clones/evaluation/src/evaluate.py
def evaluate_folder(
*,
results_folder: Path,
evaluation_llm_provider: EvaluationProviderOpenAiLike,
tasks: dict,
original_column: str,
transformed_column: str,
) -> None:
"""Runs the evaluation for the data in the results_folder.
Args:
results_folder (Path): Path to the folder where data and results are stored.
evaluation_llm_provider (EvaluationProviderOpenAiLike): An object that provides
the `evaluate` method to perform the task comparison evaluation.
tasks (dict): A dictionary where the keys are task identifiers,
and the values are task descriptions used for the task evaluation.
original_column (str): Name of the column containing the original text.
transformed_column (str): Name of the column containing the transformed text.
Returns:
None
"""
print(f"Evaluate transformed text in {results_folder}\n")
# load dataframe with original and transformed text
evaluation_df = pd.read_csv(
results_folder / settings.transformed_data_filename,
delimiter=settings.csv_separator,
)
# run LLM-based evaluation and compute indices
run_llm_criteria_evaluation(
evaluation_df=evaluation_df,
openai_provider=evaluation_llm_provider,
tasks=tasks,
transformed_column=transformed_column,
original_column=original_column,
)
run_index_evaluation(
evaluation_df=evaluation_df,
transformed_column=transformed_column,
original_column=original_column,
)
if len(settings.score_weighting) > 0:
# calculate score if a score weighting is specified
calculate_weighted_average(evaluation_df, settings.score_weighting, result_column="Score")
evaluation_filepath = results_folder / settings.evaluation_output_filename
print(f"Saving evaluation data for '{results_folder}' to '{evaluation_filepath}'")
save_dataframe(df=evaluation_df, filepath=evaluation_filepath)
print("")
run_index_evaluation
Calculates indices for the texts in a DataFrame.
| PARAMETER | DESCRIPTION |
|---|---|
| evaluation_df | The DataFrame containing the texts to be evaluated. TYPE: pd.DataFrame |
| transformed_column | Name of the column containing the transformed text. TYPE: str |
| original_column | Name of the column containing the original text. TYPE: str |

| RETURNS | DESCRIPTION |
|---|---|
| None | The function modifies the `evaluation_df` DataFrame in place, adding new columns for the results. |
Source code in docs/repositories-clones/evaluation/src/evaluate.py
def run_index_evaluation(
evaluation_df: pd.DataFrame,
transformed_column: str,
original_column: str,
) -> None:
"""Calculates indices for the texts in a DataFrame.
Args:
evaluation_df (pd.DataFrame): The DataFrame containing the texts to be evaluated.
transformed_column (str): Name of the column containing the transformed text.
original_column (str): Name of the column containing the original text.
Returns:
None: The function modifies the `evaluation_df` DataFrame in-place,
adding new columns for the results.
"""
# using partial for closure in loop, see
# https://docs.astral.sh/ruff/rules/function-uses-loop-variable/
def _wrapper(row: pd.Series, index_fn: Callable) -> bool | int | float | None:
return index_fn(
original_text=row[original_column],
transformed_text=row[transformed_column],
)
wrapped_selected_indices = {}
for index_name in settings.indices:
if index_name not in INDEX_FUNCTIONS:
raise ValueError(
f"There is no function associated with the supplied index name '{index_name}'. "
"Please check the 'indices' key of your configuration."
)
wrapped_selected_indices[index_name] = partial(
_wrapper,
index_fn=INDEX_FUNCTIONS[index_name],
)
for index_name, wrapped_index_fn in wrapped_selected_indices.items():
print(f'Calculating index "{index_name}"...')
index_result = evaluation_df.progress_apply(wrapped_index_fn, axis=1)
evaluation_df[index_name] = index_result
run_llm_criteria_evaluation
run_llm_criteria_evaluation(evaluation_df, openai_provider, tasks, transformed_column, original_column, output_column_prefix='')
Performs an LLM-based criteria evaluation on the transformed texts in a DataFrame.
The function compares the texts in the transformed_column of the DataFrame with the original texts
in the original_column based on specific tasks provided in the tasks dictionary.
Each task in the dictionary is evaluated, and the results are stored in-place in new columns named
'{output_column_prefix}{task_key}' in the DataFrame. The evaluation results are then converted
into boolean values indicating whether the task comparison criteria are met.
| PARAMETER | DESCRIPTION |
|---|---|
| evaluation_df | The DataFrame containing the texts to be evaluated. TYPE: pd.DataFrame |
| openai_provider | An object that provides the `evaluate` method to perform the task comparison evaluation. TYPE: EvaluationProviderOpenAiLike |
| tasks | A dictionary where the keys are task identifiers and the values are task descriptions used for the task comparison. TYPE: dict |
| transformed_column | Name of the column containing the transformed text. TYPE: str |
| original_column | Name of the column containing the original text. TYPE: str |
| output_column_prefix | Prefix for the column names of the results. TYPE: str |

| RETURNS | DESCRIPTION |
|---|---|
| None | The function modifies the `evaluation_df` DataFrame in place, adding new columns for the results. |
Source code in docs/repositories-clones/evaluation/src/evaluate.py
def run_llm_criteria_evaluation(
evaluation_df: pd.DataFrame,
openai_provider: EvaluationProviderOpenAiLike,
tasks: dict[str, str],
transformed_column: str,
original_column: str,
output_column_prefix: str = "",
) -> None:
"""Performs an LLM-based criteria evaluation on the transformed texts in a DataFrame.
The function compares the texts in the transformed_column of the DataFrame with the original texts
in the original_column based on specific tasks provided in the `tasks` dictionary.
Each task in the dictionary is evaluated, and the results are stored in-place in new columns named
'{output_column_prefix}{task_key}' in the DataFrame. The evaluation results are then converted
into boolean values indicating whether the task comparison criteria are met.
Args:
evaluation_df (pd.DataFrame): The DataFrame containing the texts to be evaluated.
openai_provider (EvaluationProviderOpenAiLike): An object that provides the `evaluate`
method to perform the task comparison evaluation.
tasks (dict): A dictionary where the keys are task identifiers,
and the values are task descriptions used for the task comparison.
transformed_column (str): Name of the column containing the transformed text.
original_column (str): Name of the column containing the original text.
output_column_prefix (str): Prefix for the column names of the results.
Returns:
None: The function modifies the `evaluation_df` DataFrame in-place,
adding new columns for the results.
"""
print("Run LLM Criteria Evaluation...")
for task_key, task_description in tasks.items():
column_name = f"{output_column_prefix}{task_key}"
def provider_evaluate(row: pd.Series, description: str) -> str:
return openai_provider.evaluate(
evaluate_input=row[transformed_column],
system_prompt="evaluate_task_comparison",
prompt_input=[row[original_column], description],
)
# using partial, as lambda inside loop leads to unexpected behavior,
# see https://pylint.readthedocs.io/en/latest/user_guide/messages/warning/cell-var-from-loop.html
evaluation_df[column_name] = evaluation_df.progress_apply(
partial(provider_evaluate, description=task_description),
axis=1,
)
evaluation_df[column_name] = evaluation_df[column_name].apply(convert_str_to_bool)
indices
Functions to calculate additional indices.
| FUNCTION | DESCRIPTION |
|---|---|
| example_index | An example index which serves as a template for new indices. |
| llm_hallucination_index | Performs an LLM-based hallucination evaluation on the original and transformed text. |
example_index
An example index which serves as a template for new indices.
Calculates the difference in length between original and transformed text.
| PARAMETER | DESCRIPTION |
|---|---|
| original_text | The original text. TYPE: str |
| transformed_text | The transformed text. TYPE: str |

| RETURNS | DESCRIPTION |
|---|---|
| float | The difference in length between original and transformed text. |
Source code in docs/repositories-clones/evaluation/src/indices.py
def example_index(
*,
original_text: str,
transformed_text: str,
) -> float:
"""An example index which serves as a template for new indices.
Calculates the difference in length between original and transformed text.
Args:
original_text (pd.Series): The column containing the original text.
transformed_text (pd.Series): The column containing the transformed text.
Returns:
float: The difference in length between original and transformed text.
"""
original_length = len(original_text)
transformed_length = len(transformed_text)
return float(transformed_length - original_length)
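Since example_index is explicitly a template, a new index can follow the same keyword-only signature. A minimal sketch (the function name word_count_ratio is hypothetical; to be used by run_index_evaluation it would additionally have to be registered in INDEX_FUNCTIONS and listed under the indices key of the configuration):

```python
def word_count_ratio(*, original_text: str, transformed_text: str) -> float:
    """Hypothetical index: ratio of word counts between transformed and original text."""
    original_words = len(original_text.split())
    # guard against empty originals to avoid division by zero
    return len(transformed_text.split()) / original_words if original_words else 0.0
```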
llm_hallucination_index
Performs an LLM-based hallucination evaluation on the original and transformed text.
The function checks whether hallucinations occur in the transformed text compared to the original text.
| PARAMETER | DESCRIPTION |
|---|---|
| original_text | The original text. TYPE: str |
| transformed_text | The transformed text. TYPE: str |

| RETURNS | DESCRIPTION |
|---|---|
| bool or None | The result of the evaluation. |
Source code in docs/repositories-clones/evaluation/src/indices.py
def llm_hallucination_index(
*,
original_text: str,
transformed_text: str,
) -> bool | None:
"""Performs an LLM-based hallucination evaluation on the original and transformed text.
The function checks for the transformed text if hallucinations occur in
comparison to the original text.
Args:
original_text (str): The column containing the original text.
transformed_text (str): The column containing the transformed text.
Returns:
bool or None: The result of the evaluation
"""
openai_provider = EvaluationProviderOpenAiLike(settings.llm_config.evaluation)
return convert_str_to_bool(
openai_provider.evaluate(
evaluate_input=transformed_text,
system_prompt="evaluate_hallucination",
prompt_input=original_text,
)
)
models
Pydantic models.
| MODULE | DESCRIPTION |
|---|---|
| api_input | Pydantic models for API input parameters. |
| api_output | Pydantic models for API output parameters. |
| general | Load and check settings from YAML. |
| llm_input | Pydantic models for LLM configuration. |
api_input
Pydantic models for API input parameters.
| CLASS | DESCRIPTION |
|---|---|
| SimplifyInput | Input model for /simplify endpoint to simplify text input. |
SimplifyInput
Bases: BaseModel
Input model for /simplify endpoint to simplify text input.
input_text (str): The text to be simplified. language_model (str): The identifier of the language model to use.
Source code in docs/repositories-clones/evaluation/src/models/api_input.py
api_output
Pydantic models for API output parameters.
| CLASS | DESCRIPTION |
|---|---|
| SimplifyOutput | Represents the result of a text simplification process. |
SimplifyOutput
Bases: BaseModel
Represents the result of a text simplification process.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| input_text | The original input text. |
| simplified_text | The simplified version of the input text. |
Source code in docs/repositories-clones/evaluation/src/models/api_output.py
general
Load and check settings from YAML.
| CLASS | DESCRIPTION |
|---|---|
| BaseTransformation | Base type for a transformation. |
| Dataset | Represents a dataset of text samples. |
| DatasetCreationSettings | Contains specific settings for dataset creation. |
| LLMConfig | Configuration for the list of available large language models. |
| PostConfig | Configuration for async_post request to other microservices. |
| Settings | The combined settings for the evaluation. |
| YamlSettings | A settings class that can read YAML files. |
BaseTransformation
Bases: BaseModel
Base type for a transformation.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| type | The type of the transformation. |
| label | The human-readable label of the transformation. |
Source code in docs/repositories-clones/evaluation/src/models/general.py
Dataset
Bases: BaseModel
Represents a dataset of text samples.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| examples | The list of text samples contained in the dataset. |
Source code in docs/repositories-clones/evaluation/src/models/general.py
DatasetCreationSettings
Bases: YamlSettings
Contains specific settings for dataset creation.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| dataset_name | The name of the dataset, used in the output filename. TYPE: str |
| num_examples | The number of examples to prompt the LLM for (might be inaccurate). TYPE: str |
| description | Short description/title of the dataset. TYPE: str |
| criteria | Positive criteria describing the target outcome. TYPE: str |
| llm_config | Configuration of the LLM. TYPE: DatasetCreationLLM |
| system_prompt | The system prompt of the LLM. TYPE: str |
| user_prompt | The user prompt template where num_examples, description and criteria can be inserted. TYPE: str |
| output_format | Whether to output CSV or Excel files. TYPE: str |
Source code in docs/repositories-clones/evaluation/src/models/general.py
class DatasetCreationSettings(YamlSettings):
"""Contains specific settings for dataset creation.
Attributes:
dataset_name (str): The name of the dataset, used in the output filename.
num_examples (str): The number of examples to prompt the LLM for (might be inaccurate).
description (str): Short description/title of the dataset.
criteria (str): Positive criteria describing the target outcome.
llm_config (DatasetCreationLLM): Configuration of the LLM.
system_prompt (str): The system prompt of the LLM.
user_prompt (str): The user prompt template where num_examples,
description and criteria can be inserted.
output_format (str): Whether to output CSV or Excel files.
"""
model_config = SettingsConfigDict(yaml_file="config/create_dataset.yaml", extra="forbid")
dataset_name: str
num_examples: str
description: str
criteria: str
llm_config: DatasetCreationLLM
system_prompt: str
user_prompt: str
output_format: Literal["csv", "xlsx"] = "csv"
LLMConfig
Bases: YamlSettings
Configuration for the list of available large language models.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| evaluation | The configuration for the LLM used for evaluation. TYPE: EvaluationLLM |
Source code in docs/repositories-clones/evaluation/src/models/general.py
class LLMConfig(YamlSettings):
"""Configuration for the list of available large language models.
Attributes:
evaluation (EvaluationLLM): The configuration for the LLM used for evaluation.
"""
model_config = SettingsConfigDict(yaml_file="config/llm_parameters.yaml", extra="forbid")
evaluation: EvaluationLLM
PostConfig
Bases: BaseModel
Configuration for async_post request to other microservices.
The default values in this class can be overridden by the values set in configs/general.yml.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| model_config | Used to ignore other services, which are defined in the config. TYPE: ConfigDict |
| max_attempts | Maximal number of requests before returning status code 424. TYPE: PositiveInt |
| timeout_in_s | Maximum waiting duration before timeout (in seconds). TYPE: PositiveInt |
Source code in docs/repositories-clones/evaluation/src/models/general.py
class PostConfig(BaseModel):
"""Configuration for async_post request to other microservices.
The default values in this class can be overwritten by those values stated in configs/general.yml.
Attributes:
model_config (ConfigDict): Used to ignore other services, which are defined in the config.
max_attempts (PositiveInt): Maximal number of requests before returning status code 424.
timeout_in_s (PositiveInt): Maximum waiting duration before timeout (in seconds).
"""
model_config = ConfigDict(extra="ignore")
max_attempts: PositiveInt = 1
timeout_in_s: PositiveInt = 180
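As a small illustration, the defaults can also be overridden directly when constructing the model (the import path is an assumption; normally the values come from configs/general.yml):

```python
from src.models.general import PostConfig  # import path assumed

# allow up to 3 attempts with a 60-second timeout per request
post_config = PostConfig(max_attempts=3, timeout_in_s=60)
print(post_config.model_dump())
```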
Settings
Bases: YamlSettings
The combined settings for the evaluation.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| experiment_name | The name of the experiment, also used as a folder name. TYPE: str |
| backend_endpoint | The URL of the backend. TYPE: AnyHttpUrl |
| data_files | The names of the data files, without file extension. TYPE: list[str] |
| replications | The number of replications to run. TYPE: PositiveInt |
| input_column_name | The name of the column with the input texts. TYPE: str |
| output_column_name | The name of the column for the transformed texts. TYPE: str |
| transformations | The transformations to apply to the text and evaluate. TYPE: dict |
| tasks | The tasks to use for the evaluation. TYPE: list |
| indices | The indices to compute for the evaluation. TYPE: list |
| map | The map from metrics to human-readable labels. TYPE: dict |
| score_weighting | The (non-normalized) weighting to calculate the score. TYPE: dict |
| llm_tasks | The task definitions for the tasks evaluated by the LLM. TYPE: dict |
| llm_config | The configuration for the LLM used for the evaluation. TYPE: LLMConfig |
| transformed_data_filename | Filename to store the transformed texts. TYPE: str |
| evaluation_output_filename | Filename to store the single evaluation results. TYPE: str |
| transformation_metadata_filename | Filename to store metadata about the transformation. TYPE: str |
| transformation_label_column | The name of the column where the transformation label is stored. TYPE: str |
| csv_separator | The CSV separator used to read and write CSV files. TYPE: str |
Source code in docs/repositories-clones/evaluation/src/models/general.py
class Settings(YamlSettings):
"""The combined settings for the evaluation.
Attributes:
experiment_name (str): The name of the experiment, also used as a folder name.
backend_endpoint (AnyHttpUrl): The url of the backend.
data_files (list[str]): The names of the data_files without file extension.
replications (PositiveInt): The number of replications to run.
input_column_name (str): The name of the column with the input texts.
output_column_name (str): The name of the column for the transformed texts.
transformations (dict): The transformations to apply to the text and evaluate.
tasks (list): The tasks to use for the evaluation.
indices (list): The indices to compute for the evaluation.
map (dict): The map from metrics to human-readable labels.
score_weighting (dict): The (non-normalized) weighting to calculate the score.
llm_tasks (dict): The task definitions for the tasks evaluated by the LLM.
llm_config (LLMConfig): The configuration for the LLM used for the evaluation.
transformed_data_filename (str): Filename to store the transformed texts.
evaluation_output_filename (str): Filename to store the single evaluation results.
transformation_metadata_filename (str): Filename to strore metadata about the transformation.
transformation_label_column (str): The name of the column where the transformation label is stored.
csv_separator (str): The CSV separator used to read and write CSV files.
"""
model_config = SettingsConfigDict(yaml_file="config/evaluation.yaml", extra="forbid")
experiment_name: str
backend_endpoint: AnyHttpUrl = "http://backend:8000/"
data_files: list[str]
replications: PositiveInt = 3
input_column_name: str
output_column_name: str = "Transformed"
transformations: dict[str, BaseTransformation]
tasks: list[str]
indices: list[str] | tuple[str] = tuple()
map: dict[str, str]
score_weighting: dict[str, NonNegativeFloat] = Field(default_factory=dict)
llm_tasks: dict[str, str]
llm_config: LLMConfig = LLMConfig()
transformed_data_filename: str = "input_transformed.csv"
evaluation_output_filename: str = "evaluation_results.csv"
transformation_metadata_filename: str = "transformation_metadata.yaml"
transformation_label_column: str = "Model"
csv_separator: str = ";"
_experiment_suffix: str = PrivateAttr(
default_factory=lambda: datetime.now().strftime("%Y%m%d-%H%M%S")
)
@computed_field
@property
def experiment_suffix(self) -> str:
"""The experiment suffix to distinguish different experiment runs."""
return self._experiment_suffix
YamlSettings
Bases: BaseSettings
A settings class that can read YAML files.
| METHOD | DESCRIPTION |
|---|---|
| settings_customise_sources | Define the sources and their order for loading the settings values. |
Source code in docs/repositories-clones/evaluation/src/models/general.py
class YamlSettings(BaseSettings):
"""A settings class that can read YAML files."""
@classmethod
def settings_customise_sources(
cls,
settings_cls: type[BaseSettings],
init_settings: PydanticBaseSettingsSource,
env_settings: PydanticBaseSettingsSource,
dotenv_settings: PydanticBaseSettingsSource,
file_secret_settings: PydanticBaseSettingsSource,
) -> tuple[PydanticBaseSettingsSource, ...]:
"""Define the sources and their order for loading the settings values.
Args:
settings_cls: The Settings class.
init_settings: The `InitSettingsSource` instance.
env_settings: The `EnvSettingsSource` instance.
dotenv_settings: The `DotEnvSettingsSource` instance.
file_secret_settings: The `SecretsSettingsSource` instance.
Returns:
A tuple containing the sources and their order for loading the settings values.
"""
return (
init_settings,
env_settings,
dotenv_settings,
file_secret_settings,
YamlConfigSettingsSource(settings_cls),
)
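A minimal sketch of how a new settings class could reuse this mechanism; the import path, the file name config/my_settings.yaml, and the fields are hypothetical:

```python
from pydantic_settings import SettingsConfigDict

from src.models.general import YamlSettings  # import path assumed


class MySettings(YamlSettings):
    """Hypothetical settings model read from config/my_settings.yaml."""

    model_config = SettingsConfigDict(yaml_file="config/my_settings.yaml", extra="forbid")

    experiment_name: str
    replications: int = 1


settings = MySettings()  # values come from init args, env vars, .env, secrets, then the YAML file
```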
settings_customise_sources
classmethod
settings_customise_sources(settings_cls, init_settings, env_settings, dotenv_settings, file_secret_settings)
Define the sources and their order for loading the settings values.
| PARAMETER | DESCRIPTION |
|---|---|
| settings_cls | The Settings class. TYPE: type[BaseSettings] |
| init_settings | The `InitSettingsSource` instance. TYPE: PydanticBaseSettingsSource |
| env_settings | The `EnvSettingsSource` instance. TYPE: PydanticBaseSettingsSource |
| dotenv_settings | The `DotEnvSettingsSource` instance. TYPE: PydanticBaseSettingsSource |
| file_secret_settings | The `SecretsSettingsSource` instance. TYPE: PydanticBaseSettingsSource |

| RETURNS | DESCRIPTION |
|---|---|
| tuple[PydanticBaseSettingsSource, ...] | A tuple containing the sources and their order for loading the settings values. |
Source code in docs/repositories-clones/evaluation/src/models/general.py
@classmethod
def settings_customise_sources(
cls,
settings_cls: type[BaseSettings],
init_settings: PydanticBaseSettingsSource,
env_settings: PydanticBaseSettingsSource,
dotenv_settings: PydanticBaseSettingsSource,
file_secret_settings: PydanticBaseSettingsSource,
) -> tuple[PydanticBaseSettingsSource, ...]:
"""Define the sources and their order for loading the settings values.
Args:
settings_cls: The Settings class.
init_settings: The `InitSettingsSource` instance.
env_settings: The `EnvSettingsSource` instance.
dotenv_settings: The `DotEnvSettingsSource` instance.
file_secret_settings: The `SecretsSettingsSource` instance.
Returns:
A tuple containing the sources and their order for loading the settings values.
"""
return (
init_settings,
env_settings,
dotenv_settings,
file_secret_settings,
YamlConfigSettingsSource(settings_cls),
)
llm_input
Pydantic models for LLM configuration.
| CLASS | DESCRIPTION |
|---|---|
| APIAuth | Defines authentication settings for the LLM. |
| DatasetCreationLLM | Configuration of a Large Language Model for the dataset creation. |
| EvaluationLLM | Configuration of a Large Language Model. |
| LLMAPI | Defines the API connection to the LLM. |
| LLMInference | Defines inference parameters. |
| LLMPromptConfig | Defines the structure of an LLM prompt configuration. |
| LLMPrompts | Defines the selectable LLM prompts. |
APIAuth
Bases: BaseModel
Defines authentication settings for the LLM.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| secret_path | File path where the API token or credentials are stored. TYPE: FilePath |
Source code in docs/repositories-clones/evaluation/src/models/llm_input.py
class APIAuth(BaseModel):
"""Defines Authentification settings for LLM.
Attributes:
secret_path (FilePath): File path where the api token or credentials are stored.
"""
secret_path: FilePath
@property
def secret(self) -> SecretStr:
"""The secret variable."""
with open(self.secret_path) as file:
return SecretStr(file.read().strip())
DatasetCreationLLM
Bases: BaseModel
Configuration of a Large Language Model for the dataset creation.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| label | Model name which is used in the API call, e.g. an ollama tag. |
| api | API information. |
Source code in docs/repositories-clones/evaluation/src/models/llm_input.py
EvaluationLLM
Bases: BaseModel
Configuration of a Large Language Model.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| label | Model name which is used in the API call, e.g. an ollama tag. TYPE: str |
| api | API information. TYPE: LLMAPI |
| inference | Inference parameters. TYPE: LLMInference |
| prompt_yaml_file | Path to prompts. TYPE: Path |
Source code in docs/repositories-clones/evaluation/src/models/llm_input.py
class EvaluationLLM(BaseModel):
"""Configuration of a Large Language Model.
Attributes:
label (str): Model name which is used in API call, e.g. ollama tag.
api (LLMAPI): API information.
inference (LLMInference): Inference parameters.
prompt_yaml_file (Path): Path to prompts.
"""
model_config = ConfigDict(extra="ignore")
label: str
api: LLMAPI
inference: LLMInference
prompt_yaml_file: Path
@property
def prompt_config(self) -> LLMPromptConfig:
"""The system prompts for the model read from a YAML file."""
return LLMPromptConfig(**_load_yml_config(self.prompt_yaml_file))
LLMAPI
Bases: BaseModel
Defines the API connection to the LLM.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| url | URL to the model. |
| auth | Authentication settings for the LLM. |
Source code in docs/repositories-clones/evaluation/src/models/llm_input.py
LLMInference
Bases: BaseModel
Defines inference parameters.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| temperature | Randomness/variation of the output. Higher values indicate more creativity. TYPE: float |
| top_p | Threshold for sampling only from the most likely tokens. TYPE: float |
| frequency_penalty | Reduces the likelihood of repeating tokens based on their existing frequency in the text. TYPE: float |
| presence_penalty | Encourages the model to introduce new tokens by penalizing tokens that have already appeared. TYPE: float |
Source code in docs/repositories-clones/evaluation/src/models/llm_input.py
class LLMInference(BaseModel):
"""Defines Inference parameters.
Attributes:
temperature (PositiveFloat): Randomness / variation of the output High values indicate more creativity.
top_p (PositiveFloat): Threshold for sampling only from the most likely tokens.
frequency_penalty (float): Reduces the likelihood of repeating tokens based on their existing frequency
in the text.
presence_penalty (float): Encourages the model to introduce new tokens by penalizing tokens that have
already appeared.
"""
temperature: float = 0.7
top_p: float = 1.0
frequency_penalty: float = 0.0
presence_penalty: float = 0.0
LLMPromptConfig
Bases: BaseModel
Defines the structure of an LLM prompt configuration.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| model_config | Used to ignore other services, which are defined in the config. TYPE: ConfigDict |
| system_prompts | System prompts. TYPE: LLMPrompts |
Source code in docs/repositories-clones/evaluation/src/models/llm_input.py
class LLMPromptConfig(BaseModel):
"""Defines the structure of a LLM prompt configuration.
Attributes:
model_config (ConfigDict): Used to ignore other services, which are defined in the config.
system (LLMPrompts): System prompt.
"""
model_config = ConfigDict(extra="ignore")
system_prompts: LLMPrompts
LLMPrompts
Bases: BaseModel
Defines the selectable LLM prompts.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| model_config | Used to ignore other services, which are defined in the config. TYPE: ConfigDict |
| evaluate_hallucination | Prompt for hallucination evaluation. TYPE: str |
| evaluate_task_comparison | Prompt for task evaluation. TYPE: str |
Source code in docs/repositories-clones/evaluation/src/models/llm_input.py
class LLMPrompts(BaseModel):
"""Defines the selectable LLM Prompts.
Attributes:
model_config (ConfigDict): Used to ignore other services, which are defined in the config.
evaluate_hallucination (str): Prompt for hallucination evaluation.
evaluate_task_comparison (str): Prompt for task evaluation.
"""
model_config = ConfigDict(extra="ignore")
evaluate_hallucination: str = ""
evaluate_task_comparison: str = ""
openai_evaluator
Class for text evaluation using OpenAI.
| CLASS | DESCRIPTION |
|---|---|
| EvaluationProviderOpenAiLike | Class for text evaluation with OpenAI-compatible LLM provider. |
EvaluationProviderOpenAiLike
Class for text evaluation with OpenAI-compatible LLM provider.
| ATTRIBUTE | DESCRIPTION |
|---|---|
| llm | Definition of the LLM configuration. TYPE: EvaluationLLM |
| client | OpenAI-like client performing chat completion. TYPE: OpenAI |

| METHOD | DESCRIPTION |
|---|---|
| evaluate | Take an object of type str as input and return a model-generated readability evaluation as output. |
Source code in docs/repositories-clones/evaluation/src/openai_evaluator.py
class EvaluationProviderOpenAiLike:
"""Class for text evaluation with OpenAI-compatible LLM provider.
Attributes:
llm (LLM): Definition of the LLM configuration.
client (OpenAI): OpenAI-like client performing chat completion.
Methods:
evaluate: Take an object of type stras input and return a model-generated readability evaluation as output.
"""
def __init__(self, llm: EvaluationLLM) -> None:
"""Initialise the class.
Args:
llm (LLM): Definition of the LLM configuration.
"""
self.llm: EvaluationLLM = llm
self.client = OpenAI(
api_key=llm.api.auth.secret.get_secret_value(), base_url=str(llm.api.url)
)
def evaluate(
self,
evaluate_input: str,
system_prompt: str = "evaluate",
prompt_input: str | list | None = None,
) -> str:
"""Take a string as input and return a model-generated evaluation of the text as output.
Args:
evaluate_input (str): Input to the model.
system_prompt (str): Key of the evaluation prompt
prompt_input (str | list | None): Prompt input
Returns:
str: Model-generated response text.
"""
system_prompt_content = self.llm.prompt_config.system_prompts.dict().get(system_prompt)
if system_prompt_content:
if system_prompt in ["evaluate_task", "evaluate_hallucination"]:
system_prompt_content = system_prompt_content.format(prompt_input=prompt_input)
elif system_prompt == "evaluate_task_comparison":
system_prompt_content = system_prompt_content.format(
prompt_input_1=prompt_input[0], prompt_input_2=prompt_input[1]
)
else:
pass
messages = [
{"role": "system", "content": system_prompt_content},
{"role": "user", "content": evaluate_input},
]
logger.info(f"Messages: {messages}")
response: ChatCompletion = self._generate(messages, response_format="text")
content: str = response.choices[0].message.content # type: ignore
logger.info(f"Content: {content}")
return content
def _generate(self, messages: list, response_format: str = "text") -> ChatCompletion:
"""Take a list of messages as input and return a model-generated message as output.
Args:
messages (list): Messages as input to the model.
response_format (str): Format of the response.
Returns:
ChatCompletion: Model-generated response.
"""
try:
response: ChatCompletion = self.client.chat.completions.create(
model=self.llm.label,
messages=messages,
response_format={"type": response_format},
frequency_penalty=self.llm.inference.frequency_penalty,
presence_penalty=self.llm.inference.presence_penalty,
temperature=self.llm.inference.temperature,
top_p=self.llm.inference.top_p,
stream=False,
)
logger.debug(f"Response from LLM-Client: {response}")
except Exception as e:
msg = f"{self.llm.label} API call of Chat-Completion to LLM failed: {e}"
logger.error(msg)
raise RuntimeError(msg) from e
return response
evaluate
Take a string as input and return a model-generated evaluation of the text as output.
| PARAMETER | DESCRIPTION |
|---|---|
| evaluate_input | Input to the model. TYPE: str |
| system_prompt | Key of the evaluation prompt. TYPE: str |
| prompt_input | Prompt input. TYPE: str, list or None |

| RETURNS | DESCRIPTION |
|---|---|
| str | Model-generated response text. |
Source code in docs/repositories-clones/evaluation/src/openai_evaluator.py
def evaluate(
self,
evaluate_input: str,
system_prompt: str = "evaluate",
prompt_input: str | list | None = None,
) -> str:
"""Take a string as input and return a model-generated evaluation of the text as output.
Args:
evaluate_input (str): Input to the model.
system_prompt (str): Key of the evaluation prompt
prompt_input (str | list | None): Prompt input
Returns:
str: Model-generated response text.
"""
system_prompt_content = self.llm.prompt_config.system_prompts.dict().get(system_prompt)
if system_prompt_content:
if system_prompt in ["evaluate_task", "evaluate_hallucination"]:
system_prompt_content = system_prompt_content.format(prompt_input=prompt_input)
elif system_prompt == "evaluate_task_comparison":
system_prompt_content = system_prompt_content.format(
prompt_input_1=prompt_input[0], prompt_input_2=prompt_input[1]
)
else:
pass
messages = [
{"role": "system", "content": system_prompt_content},
{"role": "user", "content": evaluate_input},
]
logger.info(f"Messages: {messages}")
response: ChatCompletion = self._generate(messages, response_format="text")
content: str = response.choices[0].message.content # type: ignore
logger.info(f"Content: {content}")
return content
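A usage sketch, assuming the LLM configuration can be loaded as in the rest of the code base (the import paths are assumptions):

```python
from src.models.general import LLMConfig  # import path assumed
from src.openai_evaluator import EvaluationProviderOpenAiLike  # import path assumed

llm_config = LLMConfig()  # reads config/llm_parameters.yaml
provider = EvaluationProviderOpenAiLike(llm_config.evaluation)

# mirrors how run_llm_criteria_evaluation calls the provider
verdict = provider.evaluate(
    evaluate_input="Transformed text to check.",
    system_prompt="evaluate_task_comparison",
    prompt_input=["Original text.", "Shorten the text without losing information."],
)
print(verdict)
```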
request_handling
Utility function for making robust synchronous HTTP POST requests with retries and error handling.
| FUNCTION | DESCRIPTION |
|---|---|
| post_with_retries | Makes a synchronous POST request with retries via httpx and parses the response into a Pydantic model. |
post_with_retries
Makes a synchronous POST request with retries via httpx and parses the response into a Pydantic model.
| PARAMETER | DESCRIPTION |
|---|---|
| url | The URL to post to. TYPE: str |
| response_model | Expected Pydantic response model. TYPE: type[ServiceApiOutput] |
| config | Configuration with timeout and max_attempts. TYPE: PostConfig |
| request_options | Data and headers for the request. TYPE: dict |
| service_name | Name of the service (for logging). TYPE: str |

| RETURNS | DESCRIPTION |
|---|---|
| ServiceApiOutput or list[ServiceApiOutput] | An instance (or list) of the given response model populated with the response data. |

| RAISES | DESCRIPTION |
|---|---|
| RuntimeError | If all retries fail or response validation fails. |
Source code in docs/repositories-clones/evaluation/src/request_handling.py
def post_with_retries(
url: str,
response_model: type[ServiceApiOutput],
config: PostConfig,
request_options: dict[str, Any],
service_name: str = "service",
) -> ServiceApiOutput | list[ServiceApiOutput]:
"""Makes a synchronous POST request with retries via httpx and parses the response into a Pydantic model.
Args:
url (str): The URL to post to.
response_model (Type[ServiceApiOutput]): Expected Pydantic response model.
config (PostConfig): Configuration with timeout and max_attempts.
request_options (dict): Data and headers for the request.
service_name (str): Name of the service (for logging).
Returns:
An instance (or list) of the given response model populated with the response data.
Raises:
RuntimeError: If all retries fail or response validation fails.
"""
logger.debug(f"Communication with {service_name} configured using {config}")
for attempt in range(1, config.max_attempts + 1):
if attempt > 1:
logger.warning(f"Retrying request to {service_name} ({attempt}/{config.max_attempts})")
try:
with httpx.Client(timeout=config.timeout_in_s) as client:
response = client.post(url, **request_options)
try:
result_dict = response.json()
except ValueError as e:
msg = f"{service_name} returned invalid JSON. Response: {response.text}"
logger.error(msg)
raise RuntimeError(msg) from e
if response.status_code == HTTPStatus.OK:
logger.debug(f"Response from {service_name}: {result_dict}")
try:
if isinstance(result_dict, list):
return [response_model.model_validate(item) for item in result_dict]
return response_model.model_validate(result_dict)
except (TypeError, ValidationError) as e:
msg = f"Invalid response structure for {service_name}: {e}"
logger.error(msg)
raise RuntimeError(msg) from e
else:
msg = (
f"{service_name} failed with status {response.status_code}. "
f"Response: {response.text}"
)
logger.error(msg)
raise RuntimeError(msg)
except httpx.RequestError as e:
if attempt < config.max_attempts:
logger.warning(
f"Request to {service_name} failed (attempt {attempt}): {e}. Retrying..."
)
time.sleep(3)
else:
msg = (
f"Could not connect to {service_name} after {config.max_attempts} attempts: {e}"
)
logger.critical(msg)
raise RuntimeError(msg) from e
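A usage sketch for calling the backend's /simplify endpoint; the import paths, the language_model value, and the exact endpoint URL are assumptions (the host matches the documented backend_endpoint default):

```python
from src.models.api_input import SimplifyInput  # import path assumed
from src.models.api_output import SimplifyOutput  # import path assumed
from src.models.general import PostConfig  # import path assumed
from src.request_handling import post_with_retries  # import path assumed

payload = SimplifyInput(input_text="Ein langer Satz.", language_model="llm-default")  # hypothetical values
result = post_with_retries(
    url="http://backend:8000/simplify",  # endpoint assumed
    response_model=SimplifyOutput,
    config=PostConfig(max_attempts=3, timeout_in_s=60),
    request_options={"json": payload.model_dump()},
    service_name="backend",
)
print(result.simplified_text)
```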
settings
Loads the configuration and makes it accessible for other modules.
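The module is not documented in more detail here; based on its use in main.py and evaluate.py, it presumably exposes a ready-to-use settings instance. A minimal sketch of the assumed usage (import path assumed):

```python
from src.settings import settings  # import path assumed

# the Settings fields documented above are available directly, e.g.:
print(settings.experiment_name, settings.experiment_suffix)
print(settings.transformed_data_filename, settings.csv_separator)
```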
summarize
Script for extended and summarized evaluation.
| FUNCTION | DESCRIPTION |
|---|---|
| barplot_mean_limits | Plots a DataFrame as a bar chart. |
| merge_evaluation_results | Merges all evaluations stored in subfolders of the input folders. |
| summarize | The main function to run the summarization. |
| summarize_evaluation_results | Create and save summary plots and tables to the given output folder. |
barplot_mean_limits
Plots a DataFrame as a bar chart.
Includes minimum and maximum values and optional saving to file.
| PARAMETER | DESCRIPTION |
|---|---|
| df | Data frame containing data for plotting. TYPE: pd.DataFrame |
| save_path | Optional path to save the plot (default is None). TYPE: Path or None |
Source code in docs/repositories-clones/evaluation/src/summarize.py
def barplot_mean_limits(df: pd.DataFrame, save_path: Path | None = None) -> None:
"""Plots a DataFrame as a bar chart.
Includes minimum and maximum values and optional saving to file.
Args:
df (pd.DataFrame): Data frame containing data for plotting.
save_path (Path): Optional path to save the plot (default is None).
"""
# Plot DataFrame using matplotlib by creating an Axes object, `ax`
yerr = np.stack(
[
(df.mean() - df.min()).values,
(df.max() - df.mean()).values,
],
axis=1,
)
# ensure positive values (sometimes very small negative values occur
# due to numerical errors)
yerr = np.maximum(0, yerr)
df.mean().T.plot(
kind="bar",
figsize=(9, 4),
width=0.8,
yerr=yerr,
capsize=2,
)
# Setting chart attributes
plt.title("Evaluation")
plt.ylabel(r"Values $[\uparrow]$")
plt.xticks(rotation=65) # Rotate x-axis labels if necessary for readability
# Setting legend position and title
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left", title="Models", fontsize="small")
# Adjusting the layout to fit all elements properly
plt.subplots_adjust(bottom=0.35, right=0.65)
plt.grid(visible=True)
if save_path:
# Save plot as an image file
plt.savefig(save_path, dpi=300)
plt.close() # Close the plot to free memory if stored in a file.
else:
# Display the plot directly if no save path provided
plt.show()
merge_evaluation_results
Merges all evaluations stored in subfolders of the input folders.
| PARAMETER | DESCRIPTION |
|---|---|
| input_folders | The input folders with the experiment results. TYPE: list[Path] |
| combined_experiments | Whether the merged results are from multiple experiments. TYPE: bool |

| RETURNS | DESCRIPTION |
|---|---|
| merged_evaluation_df | The data frame containing the evaluation results for all transformations, as well as the calculated score. TYPE: pd.DataFrame |
Source code in docs/repositories-clones/evaluation/src/summarize.py
def merge_evaluation_results(
input_folders: list[Path],
*,
combined_experiments: bool,
) -> pd.DataFrame:
"""Merges all evaluations stored in subfolders of the input folders.
Args:
input_folders (list[Path]): The input folders with the experiment results.
combined_experiments (bool): If the merged results are from multiple experiments.
Returns:
merged_evaluation_df (pd.DataFrame): The data frame containing the
evaluation results for all transformations, as well as the
calculated score.
"""
llm_results_files = get_files_with_name(input_folders, settings.evaluation_output_filename)
eval_files = []
for file_path in llm_results_files:
with open(
file_path.parent / settings.transformation_metadata_filename,
encoding="utf-8",
) as metadata_file:
metadata = yaml.safe_load(metadata_file)
label = (
metadata.get("label_combined_experiments", metadata["label"])
if combined_experiments
else metadata["label"]
)
dataset_name = file_path.parent.parent.parent.name
eval_df = pd.read_csv(file_path, sep=settings.csv_separator)
eval_df["Run"] = file_path.parent.name
eval_df[settings.transformation_label_column] = label
eval_df["Dataset"] = dataset_name
eval_files.append(eval_df)
merged_evaluation_df = pd.concat(eval_files)
return merged_evaluation_df
summarize
The main function to run the summarization.
The parameters are provided by configuration files.
| PARAMETER | DESCRIPTION |
|---|---|
| input_folders | The paths to the folders with the results. TYPE: list[Path] |
| output_folder | The path to the folder where the results of this run are stored. TYPE: Path |
Source code in docs/repositories-clones/evaluation/src/summarize.py
def summarize(input_folders: list[Path], output_folder: Path) -> None:
"""The main function to run the summarization.
The parameters are provided by configuration files.
Args:
input_folders (list[Path]): The paths to the folders with the results.
output_folder (Path): The path to the folder were the results of this run
are stored.
"""
# Mappings
ordered_transformation_labels = [t.label for t in settings.transformations.values()]
combined_experiments = len(input_folders) > 1
output_folder.mkdir(parents=True, exist_ok=True)
merged_df = merge_evaluation_results(
input_folders,
combined_experiments=combined_experiments,
)
summarize_evaluation_results(
output_folder,
merged_df,
ordered_transformation_labels,
combined_experiments=combined_experiments,
)
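A usage sketch for combining finished experiment runs manually; the folder names are hypothetical, and main.py builds a comparable output folder for the combine-experiments subcommand:

```python
from pathlib import Path

from src.summarize import summarize  # import path assumed

summarize(
    input_folders=[
        Path("results/experiment_a__20240101-120000"),  # hypothetical experiment folders
        Path("results/experiment_b__20240102-093000"),
    ],
    output_folder=Path("results/combined_experiments/manual_run"),
)
```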
summarize_evaluation_results
summarize_evaluation_results(output_folder, merged_evaluation_df, ordered_transformation_labels, *, combined_experiments)
Create and save summary plots and tables to the given output folder.
| PARAMETER | DESCRIPTION |
|---|---|
| output_folder | The folder to store the summarized results. TYPE: Path |
| merged_evaluation_df | The data frame with the evaluation results for all transformations. TYPE: pd.DataFrame |
| ordered_transformation_labels | The labels of the transformations in the order they will appear in the output. TYPE: list |
| combined_experiments | Whether the merged results are from multiple experiments. TYPE: bool |
Source code in docs/repositories-clones/evaluation/src/summarize.py
def summarize_evaluation_results(
output_folder: list[Path],
merged_evaluation_df: pd.DataFrame,
ordered_transformation_labels: list[str],
*,
combined_experiments: bool,
) -> None:
"""Create and save summary plots and tables to the given output folder.
Args:
output_folder (Path): The folder to store the summarized results.
merged_evaluation_df (pd.DataFrame): The data frame with the evaluation
results for all transformations.
ordered_transformation_labels (list): The labels of the transformations
in the order they will appear in the output.
combined_experiments (bool): If the merged results are from multiple experiments.
"""
if not combined_experiments:
# restore the order given in the config, not implemented for multiple experiments
merged_evaluation_df[settings.transformation_label_column] = pd.Categorical(
merged_evaluation_df[settings.transformation_label_column],
categories=ordered_transformation_labels,
ordered=True,
)
merged_evaluation_df = merged_evaluation_df.sort_values(
settings.transformation_label_column
)
# rename columns from identifiers to human-readable labels
merged_evaluation_df = merged_evaluation_df.rename(columns=settings.map)
# store as CSV and Excel
print("Saving detailed results to file:")
detailed_results_csv_filepath = output_folder / "detailed_results.csv"
print(f"- CSV: '{detailed_results_csv_filepath}'")
merged_evaluation_df.to_csv(detailed_results_csv_filepath, sep=settings.csv_separator)
detailed_results_excel_filepath = output_folder / "detailed_results.xlsx"
print(f"- Excel: '{detailed_results_excel_filepath}'")
merged_evaluation_df.to_excel(detailed_results_excel_filepath)
# select numeric and boolean columns for summary output
numeric_bool_columns = merged_evaluation_df.select_dtypes(
include=["number", "bool"]
).columns.tolist()
# score is specified manually below so that it is always included, remove it here
score_name = settings.map.get("Score", "Score")
if score_name in numeric_bool_columns:
numeric_bool_columns.remove(score_name)
selected_columns = [
"Run",
settings.transformation_label_column,
"Dataset",
score_name,
] + numeric_bool_columns
# filter out non-existing columns (e.g., Score might be missing)
selected_columns = [name for name in selected_columns if name in merged_evaluation_df.columns]
# calculate mean step by step, relevant if the number of replications
# or number of examples differs between datasets
run_vals = (
# select columns for grouping and the relevant numeric values to average
merged_evaluation_df[selected_columns]
# average over examples (mean of a given dataset, model and replication)
.groupby(by=["Run", settings.transformation_label_column, "Dataset"], observed=True)
.mean()
# average over datasets (mean of a replication and model)
.groupby(by=[settings.transformation_label_column, "Run"], observed=True)
.mean()
)
# save barplot of the averaged results
barplot_filepath = output_folder / "summary.png"
print(f"Saving summarized results as barplot to '{barplot_filepath}'")
barplot_mean_limits(
run_vals.groupby(by=[settings.transformation_label_column], observed=True),
barplot_filepath,
)
print("Saving summary table:")
run_vals = run_vals.groupby(by=[settings.transformation_label_column], observed=True)
summary_table_markdown_filepath = output_folder / "summary.md"
print(f"- Markdown: {summary_table_markdown_filepath}")
save_evaluation_to_md(run_vals, summary_table_markdown_filepath, score_name=score_name)
# store statistics calculated over the replications
run_vals.describe().to_csv(
output_folder / "summary_statistics_replications.csv",
sep=settings.csv_separator,
)
run_vals.describe().to_excel(output_folder / "summary_statistics_replications.xlsx")
# store averaged results as CSV and Excel
summary_table_csv_filepath = output_folder / "summary.csv"
print(f"- CSV: {summary_table_csv_filepath}")
run_vals.mean().to_csv(summary_table_csv_filepath, sep=settings.csv_separator)
summary_table_excel_filepath = output_folder / "summary.xlsx"
print(f"- Excel: {summary_table_excel_filepath}")
run_vals.mean().to_excel(summary_table_excel_filepath)
transform
Script for LLM-Evaluation.
| FUNCTION | DESCRIPTION |
|---|---|
| copy_column | Copies the texts in the input_column of a DataFrame to output_column. |
| execute_transform | Transform the text and save original and transformed text to disk. |
| generate | Sends a SimplifyInput object to a backend service and returns the simplified text. |
| transform | The main function to run the transformation. |
| transform_column | Transforms the texts in the input_column of a DataFrame using the backend. |
copy_column
Copies the texts in the input_column of a DataFrame to output_column.
| PARAMETER | DESCRIPTION |
|---|---|
| `df` | The DataFrame containing the texts to be transformed in the input_column. TYPE: `pd.DataFrame` |
| `input_column` | The name of the column to be transformed. TYPE: `str` |
| `output_column` | The name of the column where the result is stored. TYPE: `str` |

| RETURNS | DESCRIPTION |
|---|---|
| `DataFrame` | The updated DataFrame with a new output_column containing the copied texts. |

| RAISES | DESCRIPTION |
|---|---|
| `ValueError` | The input_column is not in the provided data frame. |
Source code in docs/repositories-clones/evaluation/src/transform.py
def copy_column(
df: pd.DataFrame,
*,
input_column: str,
output_column: str,
) -> pd.DataFrame:
"""Copies the texts in the input_column of a DataFrame to output_column.
Args:
df (pd.DataFrame): The DataFrame containing the texts to be transformed in the input_column.
input_column (str): The name of the column to be transformed.
output_column (str): The name of the column where the result is stored.
Returns:
pd.DataFrame: The updated DataFrame with a new output_column containing the copied texts.
Raises:
ValueError: The input_column is not in the provided data frame.
"""
print("Copying...")
try:
df[output_column] = df[input_column]
return df
except KeyError as e:
raise ValueError(
f"The column to copy ('{input_column}') is not present in the provided data frame.\n"
f"Available columns are: {list(df.columns)}.\nPlease adapt your configuration "
"or add the column to your data file."
) from e
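Hypothetical usage (the column names below are made up for illustration): copy a manually simplified column into the output column expected by the evaluation.

import pandas as pd

df = pd.DataFrame(
    {"text": ["A rather long sentence."], "manual_simplification": ["A short sentence."]}
)
df = copy_column(df, input_column="manual_simplification", output_column="simplified_text")
print(df["simplified_text"].iloc[0])  # "A short sentence."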
execute_transform
Transform the text and save original and transformed text to disk.
| PARAMETER | DESCRIPTION |
|---|---|
| `name` | The name of the transformation. TYPE: `str` |
| `label` | The label of the transformation for end users. TYPE: `str` |
| `data_filename` | The name of the data file in the data folder, without the .csv file ending. TYPE: `str` |
| `results_folder` | The folder to store the transformed texts. TYPE: `Path` |
| `transform_fn` | A transform function that receives the original dataframe and returns the modified dataframe. Usually the function adds a column with the transformed text. TYPE: `Callable` |
Source code in docs/repositories-clones/evaluation/src/transform.py
def execute_transform(
*,
name: str,
label: str,
data_filename: str,
results_folder: Path,
transform_fn: Callable,
) -> None:
"""Transform the text and save original and transformed text to disk.
Args:
name (str): The name of the transformation.
label (str): The label of the transformation for end users.
data_filename (str): The name of the data file in the data folder, without
the .csv file ending.
results_folder (Path): The folder to store the transformed texts.
transform_fn (Callable): A transform function that receives the original
dataframe and returns the modified dataframe. Usually the function
adds a column with the transformed text.
"""
# read the input data
data_folder = Path("data")
csv_data_path = data_folder / f"{data_filename}.csv"
xlsx_data_path = data_folder / f"{data_filename}.xlsx"
if csv_data_path.is_file():
df = pd.read_csv(csv_data_path, delimiter=settings.csv_separator)
data_path = csv_data_path
if xlsx_data_path.is_file():
print(
f"{data_filename}: Both CSV ({csv_data_path}) and Excel ({xlsx_data_path}) files found. Using CSV..."
)
elif xlsx_data_path.is_file():
df = pd.read_excel(xlsx_data_path)
data_path = xlsx_data_path
else:
raise ValueError(
f"Could not find input file for data filename {data_filename}. "
f"Neither '{csv_data_path}' nor '{xlsx_data_path}' exist."
)
# execute the transformation
print(f"Transforming text for: {name}...")
transformed_df = transform_fn(df=df)
if transformed_df is None:
print("Transformation did not succeed.")
return
# ensure that the output folder exists
results_folder.mkdir(parents=True, exist_ok=True)
# store relevant metadata
metadata = {
"name": name,
"label": label,
# more detailed label to distinguish between experiments with the same name
"label_combined_experiments": f"{label} ({settings.experiment_name} - {settings.experiment_suffix})",
"data_path": str(data_path),
}
with open(
results_folder / settings.transformation_metadata_filename,
"w",
encoding="utf-8",
) as file:
yaml.dump(metadata, file, default_flow_style=False)
transformed_data_filepath = results_folder / settings.transformed_data_filename
print(f"Saving original and transformed text to '{transformed_data_filepath}'")
save_dataframe(transformed_df, transformed_data_filepath)
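execute_transform is normally driven by transform below, but it can also be called directly. A sketch with a placeholder data file and a trivial uppercasing transform (both made up here for illustration):

from functools import partial
from pathlib import Path

import pandas as pd

def uppercase_column(df: pd.DataFrame, *, input_column: str, output_column: str) -> pd.DataFrame:
    # Placeholder transform: simply upper-cases the input column.
    df[output_column] = df[input_column].str.upper()
    return df

execute_transform(
    name="uppercase",
    label="Uppercase baseline",
    data_filename="example_dataset",  # expects data/example_dataset.csv or .xlsx to exist
    results_folder=Path("results/demo/example_dataset/uppercase/01"),
    transform_fn=partial(uppercase_column, input_column="text", output_column="simplified_text"),
)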
generate
Sends a SimplifyInput object to a backend service and returns the simplified text.
The function serializes the input Pydantic model to JSON, constructs the full URL from a base URL and the given endpoint, and performs a synchronous HTTP POST request with retries. The response is validated against the SimplifyOutput model.
| PARAMETER | DESCRIPTION |
|---|---|
| `input_text` | The input data to be simplified, as a Pydantic model. TYPE: `SimplifyInput` |
| `endpoint` | The API endpoint to append to the backend base URL. TYPE: `str` |

| RETURNS | DESCRIPTION |
|---|---|
| `str` | The simplified text returned by the backend service. |
Source code in docs/repositories-clones/evaluation/src/transform.py
def generate(input_text: SimplifyInput, endpoint: str) -> str:
"""Sends a SimplifyInput object to a backend service and returns the simplified text.
The function serializes the input Pydantic model to JSON, constructs the full
URL from a base URL and the given endpoint, and performs a synchronous HTTP POST
request with retries. The response is validated against the SimplifyOutput model.
Args:
input_text (SimplifyInput): The input data to be simplified, as a Pydantic model.
endpoint (str): The API endpoint to append to the backend base URL.
Returns:
str: The simplified text returned by the backend service.
"""
url = urljoin(str(settings.backend_endpoint), endpoint)
json = input_text.model_dump()
post_config = PostConfig()
headers = {"Content-Type": "application/json", "Accept": "application/json"}
request_options = {
"json": json,
"headers": headers,
}
result = post_with_retries(
url=url,
response_model=SimplifyOutput,
config=post_config,
request_options=request_options,
service_name="Chat",
)
return result.simplified_text
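A hypothetical call (the model name is a placeholder, and the backend configured via settings.backend_endpoint must be reachable):

simplified = generate(
    SimplifyInput(input_text="The committee reached a unanimous consensus.", language_model="model-x"),
    endpoint="/simplify",
)
print(simplified)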
transform
The main function to run the transformation.
The parameters are provided by configuration files.
| PARAMETER | DESCRIPTION |
|---|---|
| `results_folder` | The path to the folder where the results of this run are stored. TYPE: `Path` |
Source code in docs/repositories-clones/evaluation/src/transform.py
def transform(results_folder: Path) -> None:
"""The main function to run the transformation.
The parameters are provided by configuration files.
Args:
results_folder (Path): The path to the folder where the results of this run
are stored.
"""
input_column = settings.input_column_name
output_column = settings.output_column_name
transformations = {}
for transformation_name, transformation in settings.transformations.items():
match transformation.type:
case "manual":
transformations[transformation_name] = {
"function": partial(
copy_column,
input_column=transformation.column,
output_column=output_column,
),
"label": transformation.label,
}
case "backend":
transformations[transformation_name] = {
"function": partial(
transform_column,
endpoint="/simplify",
language_model=transformation.model_name,
input_column=input_column,
output_column=output_column,
),
"label": transformation.label,
}
case _:
print(f"Invalid configuration. Unknown transformation type '{transformation.type}'")
sys.exit(1)
results_folder.mkdir(parents=True, exist_ok=True)
for name, transformation in transformations.items():
for data_file in settings.data_files:
for replication in range(settings.replications):
execute_transform(
name=name,
label=transformation["label"],
data_filename=data_file,
results_folder=results_folder / data_file / name / f"{replication + 1:02}",
transform_fn=transformation["function"],
)
transform_column
Transforms the texts in the input_column of a DataFrame using the backend.
The function applies the text transformation to each entry in the input_column
by converting the text into a SimplifyInput object and using the generate function to generate
the transformed version of the text. The result is stored in a new column output_column.
| PARAMETER | DESCRIPTION |
|---|---|
| `df` | The DataFrame containing the texts to be transformed in the input_column. TYPE: `pd.DataFrame` |
| `endpoint` | Backend endpoint used to perform generation. TYPE: `str` |
| `language_model` | The LLM from the backend used to generate text. Needs to be listed as active LLM in the backend configuration. TYPE: `str` |
| `input_column` | The name of the column to be transformed. TYPE: `str` |
| `output_column` | The name of the column where the result is stored. TYPE: `str` |

| RETURNS | DESCRIPTION |
|---|---|
| `DataFrame` | The updated DataFrame with a new output_column containing the simplified texts. |
Source code in docs/repositories-clones/evaluation/src/transform.py
def transform_column(
df: pd.DataFrame,
endpoint: str,
language_model: str,
*,
input_column: str,
output_column: str,
) -> pd.DataFrame:
"""Transforms the texts in the input_column of a DataFrame using the backend.
The function applies the text transformation to each entry in the input_column
by converting the text into a `SimplifyInput` object and using the `generate` function to generate
the transformed version of the text. The result is stored in a new column output_column.
Args:
df (pd.DataFrame): The DataFrame containing the texts to be transformed in the input_column.
endpoint (str): Backend endpoint used to perform generation.
language_model (str): The LLM from the backend used to generate text. Needs to be listed
as active LLM in the backend configuration.
input_column (str): The name of the column to be transformed.
output_column (str): The name of the column where the result is stored.
Returns:
pd.DataFrame: The updated DataFrame with a new output_column containing the simplified texts.
"""
print("Transforming...")
def parse_to_simplify_input(input_text: str) -> SimplifyInput:
return SimplifyInput(input_text=input_text, language_model=language_model)
df[output_column] = (
df[input_column]
.apply(parse_to_simplify_input)
.progress_apply(lambda x: generate(x, endpoint=endpoint))
)
return df
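progress_apply comes from tqdm's pandas integration and has to be registered once via tqdm.pandas(), presumably at import time elsewhere in this module. A hypothetical usage (placeholder model name; the backend must be running):

import pandas as pd
from tqdm import tqdm

tqdm.pandas()  # registers Series.progress_apply

df = pd.DataFrame({"text": ["A complicated sentence about administrative procedures."]})
df = transform_column(
    df,
    endpoint="/simplify",
    language_model="model-x",  # placeholder; must be an active LLM in the backend configuration
    input_column="text",
    output_column="simplified_text",
)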
utils
Utility functions.
| FUNCTION | DESCRIPTION |
|---|---|
| `calculate_weighted_average` | Calculates the weighted average for a given weighting. |
| `convert_str_to_bool` | Converts a string to a Boolean value. |
| `get_files_with_name` | Finds all files with a given filename in a directory and its subdirectories. |
| `save_dataframe` | Saves the given DataFrame as a CSV file. |
| `save_dataframe_to_md` | Saves a dataframe to a Markdown file. |
| `save_evaluation_to_md` | Saves a dataframe with mean, min and max to a Markdown file. |
| `sort_columns_semi_manual` | Arranges the data frame with the columns in front first and the rest sorted alphabetically. |
calculate_weighted_average
Calculates the weighted average for a given weighting.
The average is stored in a new column in-place.
| PARAMETER | DESCRIPTION |
|---|---|
| `df` | The dataframe containing the values. TYPE: `pd.DataFrame` |
| `weighting` | The weighting as dictionary with the column names as keys and the corresponding weights as values (weights do not have to sum to one). TYPE: `dict[str, float]` |
| `result_column` | Name of the column to store the result in. TYPE: `str` |
Source code in docs/repositories-clones/evaluation/src/utils.py
def calculate_weighted_average(
df: pd.DataFrame, weighting: dict[str, float], result_column: str
) -> None:
"""Calculates the weighted average for a given weighting.
The average is stored in a new column in-place.
Args:
df (pd.DataFrame): The dataframe containing the values
weighting (dict[str, float]): The weighting as dictionary with the column
names as keys and the corresponding weights as values (weights do not have
to sum to one).
result_column (str): Name of the column to store the result in.
"""
weighting_keys = list(weighting.keys())
weighting_values = list(weighting.values())
df[result_column] = df[weighting_keys].mul(weighting).sum(axis=1) / sum(weighting_values)
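A small worked example with made-up metric columns; because the sum of weighted values is divided by the sum of the weights, the weights do not have to sum to one:

import pandas as pd

df = pd.DataFrame({"Readability": [0.8, 0.4], "Faithfulness": [1.0, 0.5]})
calculate_weighted_average(df, {"Readability": 2.0, "Faithfulness": 1.0}, result_column="Score")
print(df["Score"].round(3).tolist())  # [0.867, 0.433] -> (2*0.8 + 1*1.0)/3 and (2*0.4 + 1*0.5)/3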
convert_str_to_bool
Converts a string to a Boolean value.
If the string is "true" (case-insensitive), True is returned.
If the string is "false" (case-insensitive), False is returned.
For all other input values, None is returned.
| PARAMETER | DESCRIPTION |
|---|---|
| `value` | The value as a string that should be converted to a Boolean. TYPE: `str` |

| RETURNS | DESCRIPTION |
|---|---|
| `bool` or `None` | The converted Boolean value (`True` or `False`), or `None` if the string contains a different value. |
Source code in docs/repositories-clones/evaluation/src/utils.py
def convert_str_to_bool(value: str) -> bool | None:
"""Converts a string to a Boolean value.
If the string is "true" (case-insensitive), `True` is returned.
If the string is "false" (case-insensitive), `False` is returned.
For all other input values, `None` is returned.
Args:
value (str): The value as a string that should be converted to a Boolean.
Returns:
bool or None: The converted Boolean value (`True` or `False`), or `None`
if the string contains a different value.
"""
if value.lower() == "true":
return True
if value.lower() == "false":
return False
return None
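Behaviour for a few representative inputs:

print(convert_str_to_bool("TRUE"))   # True
print(convert_str_to_bool("false"))  # False
print(convert_str_to_bool("yes"))    # None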
get_files_with_name
Finds all files with a given filename in a directory and its subdirectories.
| PARAMETER | DESCRIPTION |
|---|---|
| `directory` | The root directory/directories of the search. TYPE: `Path` or `list[Path]` |
| `filename` | The filename to search for. TYPE: `str` |

| RETURNS | DESCRIPTION |
|---|---|
| `list[Path]` | The files with name filename. |
Source code in docs/repositories-clones/evaluation/src/utils.py
def get_files_with_name(directory: Path | list[Path], filename: str) -> list[Path]:
"""Finds all files with a given filename in a directory and its subdirectories.
Args:
directory (Path or list[Path]): The root directory/directories of the search.
filename (str): The filename to search for.
Returns:
list of Path: The files with name filename.
"""
paths = []
if isinstance(directory, Path):
directory = [directory]
for root_dir in directory:
for subdir, _, files in Path(root_dir).walk():
if filename in files:
paths.append(subdir / filename)
return paths
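A hypothetical call collecting metadata files from two result folders (folder and file names are made up; note that Path.walk() used above requires Python 3.12 or newer):

from pathlib import Path

metadata_files = get_files_with_name(
    [Path("results/run_a"), Path("results/run_b")],
    "transformation_metadata.yaml",
)
print(metadata_files)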
save_dataframe
Saves the given DataFrame as a CSV file.
| PARAMETER | DESCRIPTION |
|---|---|
| `df` | The DataFrame to be saved. TYPE: `pd.DataFrame` |
| `filepath` | The path to save the dataframe. TYPE: `Path` |
Source code in docs/repositories-clones/evaluation/src/utils.py
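The source listing for save_dataframe is not reproduced here. A minimal sketch that matches the documented behaviour (writing the DataFrame as a CSV file with the configured separator) could look as follows; the actual implementation in utils.py may differ:

def save_dataframe(df: pd.DataFrame, filepath: Path) -> None:
    """Saves the given DataFrame as a CSV file."""
    # Sketch only: assumes the same settings.csv_separator used by the other CSV writers.
    df.to_csv(filepath, sep=settings.csv_separator)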
save_dataframe_to_md
Saves a dataframe to a Markdown file.
| PARAMETER | DESCRIPTION |
|---|---|
| `df` | The dataframe to save. TYPE: `pd.DataFrame` |
| `file_path` | The location to save it. TYPE: `Path` |
| `score_name` | The name of the score column which is listed first. TYPE: `str` |
Source code in docs/repositories-clones/evaluation/src/utils.py
def save_dataframe_to_md(df: pd.DataFrame, file_path: Path, score_name: str = "Score") -> None:
"""Saves a dataframe to a Markdown file.
Args:
df (pd.DataFrame): The dataframe to save.
file_path (Path): The location to save it.
score_name (str): The name of the score column which is listed first.
"""
df = sort_columns_semi_manual(df, [score_name])
tab = df.fillna("--").to_markdown(floatfmt=".2f")
# Save to file
with open(file_path, "w", encoding="utf-8") as f:
f.write(tab + "\n")
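Quick sketch with made-up rows: missing values are rendered as "--" and floats are formatted with two decimals; to_markdown requires the tabulate package.

import pandas as pd
from pathlib import Path

table = pd.DataFrame({"Score": [0.91, None], "Fluency": [0.8, 0.7]}, index=["model-x", "baseline"])
save_dataframe_to_md(table, Path("table.md"), score_name="Score")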
save_evaluation_to_md
Saves a dataframe with mean, min and max to a Markdown file.
| PARAMETER | DESCRIPTION |
|---|---|
| `grouped_df` | The DataFrameGroupBy or DataFrame object to save. TYPE: `DataFrameGroupBy` or `pd.DataFrame` |
| `file_path` | The location to save it. TYPE: `Path` |
| `score_name` | The name of the score column which is listed first. TYPE: `str` |
Source code in docs/repositories-clones/evaluation/src/utils.py
def save_evaluation_to_md(
grouped_df: pd.core.groupby.generic.DataFrameGroupBy | pd.DataFrame,
file_path: Path,
score_name: str = "Score",
) -> None:
"""Saves a dataframe with mean, min and max to a Markdown file.
Args:
grouped_df (pd.core.groupby.generic.DataFrameGroupBy | pd.DataFrame):
The DataFrameGroupBy or DataFrame object to save.
file_path (Path): The location to save it.
score_name (str): The name of the score column which is listed first.
"""
stats_df = grouped_df.describe(percentiles=[])
stats_df = stats_df.drop(["count", "std"], axis=1, level=1)
mean_df = grouped_df.mean()
for col in mean_df.columns:
if col not in stats_df.columns.get_level_values(0):
continue
mean_df[col] = [
f"{stats_df[col].loc[row]['mean']:.2f} "
f"({stats_df[col].loc[row]['min']:.2f} - "
f"{stats_df[col].loc[row]['max']:.2f})"
for row in stats_df[col].index
]
save_dataframe_to_md(mean_df, file_path, score_name)
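An illustrative call with a tiny grouped frame (values are made up): each metric cell in the written table then reads mean (min - max), e.g. 0.70 (0.60 - 0.80) for model "a".

import pandas as pd
from pathlib import Path

results = pd.DataFrame({"Model": ["a", "a", "b", "b"], "Score": [0.6, 0.8, 0.4, 0.5]})
save_evaluation_to_md(results.groupby("Model"), Path("summary.md"), score_name="Score")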
sort_columns_semi_manual
Arranges the data frame with the columns in front first and the rest sorted alphabetically.
The function filters out non-existing columns in front.
| PARAMETER | DESCRIPTION |
|---|---|
| `df` | The data frame to sort. TYPE: `pd.DataFrame` |
| `front` | The first column names in the desired order. TYPE: `list[str]` |
| `kwargs` | Additional arguments passed to the `sorted` function. TYPE: `dict` |

| RETURNS | DESCRIPTION |
|---|---|
| `DataFrame` | Data frame with the reordered columns. |
Source code in docs/repositories-clones/evaluation/src/utils.py
def sort_columns_semi_manual(
df: pd.DataFrame, front: list[str] = (), **kwargs: dict
) -> pd.DataFrame:
"""Arranges the data frame with the columns in front first and the rest sorted alphabetically.
The function filters out non-existing columns in `front`.
Args:
df (pd.DataFrame): The data frame to sort.
front (list of strings): The first column names in the desired order.
kwargs (dict): additional arguments passed to the sorted function.
Returns:
pd.DataFrame: Data frame with the reordered columns.
"""
# filter non-existing columns
front = [col for col in front if col in df.columns]
cols = list(front) + sorted([col for col in df.columns if col not in front], **kwargs)
return df[cols]
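Example with made-up column names: entries in front are pinned in the given order (non-existing names are dropped silently), the remaining columns follow alphabetically.

import pandas as pd

df = pd.DataFrame(columns=["Fluency", "Score", "Accuracy"])
print(list(sort_columns_semi_manual(df, ["Score", "Missing"]).columns))
# ['Score', 'Accuracy', 'Fluency']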