
Commit a4065c9

Remove unused function of Pipelines (#3330)
1 parent e9d24e5 commit a4065c9

1 file changed: +2 additions, −134 deletions
pipelines/pipelines/pipelines/base.py

Lines changed: 2 additions & 134 deletions
@@ -35,6 +35,7 @@
     get_pipeline_definition,
     read_pipeline_config_from_yaml,
 )
+from pipelines.schema import Document, Label, MultiLabel
 from pipelines.pipelines.utils import generate_code
 
 try:
@@ -203,16 +204,10 @@ def load_from_config(cls,
                 pipeline_name=pipeline_name,
                 overwrite_with_env_variables=overwrite_with_env_variables,
             )
-        elif pipeline_definition["type"] == "RayPipeline":
-            return RayPipeline.load_from_config(
-                pipeline_config=pipeline_config,
-                pipeline_name=pipeline_name,
-                overwrite_with_env_variables=overwrite_with_env_variables,
-            )
         else:
             raise KeyError(
                 f"Pipeline Type '{pipeline_definition['type']}' is not a valid. The available types are"
-                f"'Pipeline' and 'RayPipeline'.")
+                f"'Pipeline'.")
 
     @classmethod
     def load_from_yaml(cls,
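
Note on the hunk above: after this commit, "Pipeline" is the only type the loader accepts, and any other value (including a leftover "RayPipeline") falls through to the KeyError branch. A minimal, self-contained sketch of the remaining dispatch (simplified from the diff; _load_pipeline and the flat config lookup are illustrative stand-ins, not the library's real API, which resolves the definition via get_pipeline_definition):

def load_from_config(pipeline_config: dict, pipeline_name: str = None):
    # Simplified lookup; the real code resolves the definition with
    # get_pipeline_definition(pipeline_config, pipeline_name).
    pipeline_definition = pipeline_config["pipelines"][0]
    if pipeline_definition["type"] == "Pipeline":
        return _load_pipeline(pipeline_config, pipeline_name)
    else:
        # The RayPipeline branch is gone; every other type now raises.
        raise KeyError(
            f"Pipeline Type '{pipeline_definition['type']}' is not a valid. "
            f"The available types are 'Pipeline'.")

def _load_pipeline(config, name):
    # Placeholder for the real Pipeline construction.
    return ("Pipeline", name)

print(load_from_config({"pipelines": [{"type": "Pipeline"}]}, "query"))
# load_from_config({"pipelines": [{"type": "RayPipeline"}]}, "query")  -> KeyError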
@@ -528,133 +523,6 @@ def _reorder_columns(self, df: DataFrame,
         assert len(reordered_columns) == len(df.columns)
         return df.reindex(columns=reordered_columns)
 
-    def _build_eval_dataframe(self, query: str, query_labels: MultiLabel,
-                              node_name: str, node_output: dict) -> DataFrame:
-        """
-        Builds a Dataframe for each query from which evaluation metrics can be calculated.
-        Currently only answer or document returning nodes are supported, returns None otherwise.
-
-        Each row contains either an answer or a document that has been retrieved during evaluation.
-        Rows are being enriched with basic infos like rank, query, type or node.
-        Additional answer or document specific evaluation infos like gold labels
-        and metrics depicting whether the row matches the gold labels are included, too.
-        """
-
-        if query_labels is None or query_labels.labels is None:
-            logger.warning(
-                f"There is no label for query '{query}'. Query will be omitted."
-            )
-            return pd.DataFrame()
-
-        # remarks for no_answers:
-        # Single 'no_answer'-labels are not contained in MultiLabel aggregates.
-        # If all labels are no_answers, MultiLabel.answers will be [""] and the other aggregates []
-        gold_answers = query_labels.answers
-        gold_offsets_in_documents = query_labels.gold_offsets_in_documents
-        gold_document_ids = query_labels.document_ids
-        gold_document_contents = query_labels.document_contents
-
-        # if node returned answers, include answer specific info:
-        # - the answer returned itself
-        # - the document_id the answer was found in
-        # - the position or offsets within the document the answer was found
-        # - the surrounding context of the answer within the document
-        # - the gold answers
-        # - the position or offsets of the gold answer within the document
-        # - the gold document ids containing the answer
-        # - the exact_match metric depicting if the answer exactly matches the gold label
-        # - the f1 metric depicting how well the answer overlaps with the gold label on token basis
-        # - the sas metric depicting how well the answer matches the gold label on a semantic basis.
-        #   this will be calculated on all queries in eval() for performance reasons if a sas model has been provided
-
-        partial_dfs = []
-        for field_name in ["answers", "answers_isolated"]:
-            df = pd.DataFrame()
-            answers = node_output.get(field_name, None)
-            if answers is not None:
-                answer_cols_to_keep = [
-                    "answer", "document_id", "offsets_in_document", "context"
-                ]
-                df_answers = pd.DataFrame(answers, columns=answer_cols_to_keep)
-                if len(df_answers) > 0:
-                    df_answers["type"] = "answer"
-                    df_answers["gold_answers"] = [gold_answers
-                                                  ] * len(df_answers)
-                    df_answers["gold_offsets_in_documents"] = [
-                        gold_offsets_in_documents
-                    ] * len(df_answers)
-                    df_answers["gold_document_ids"] = [gold_document_ids
-                                                       ] * len(df_answers)
-                    df_answers["exact_match"] = df_answers.apply(
-                        lambda row: calculate_em_str_multi(
-                            gold_answers, row["answer"]),
-                        axis=1)
-                    df_answers["f1"] = df_answers.apply(
-                        lambda row: calculate_f1_str_multi(
-                            gold_answers, row["answer"]),
-                        axis=1)
-                    df_answers["rank"] = np.arange(1, len(df_answers) + 1)
-                df = pd.concat([df, df_answers])
-
-            # add general info
-            df["node"] = node_name
-            df["multilabel_id"] = query_labels.id
-            df["query"] = query
-            df["filters"] = json.dumps(query_labels.filters,
-                                       sort_keys=True).encode()
-            df["eval_mode"] = "isolated" if "isolated" in field_name else "integrated"
-            partial_dfs.append(df)
-
-        # if node returned documents, include document specific info:
-        # - the document_id
-        # - the content of the document
-        # - the gold document ids
-        # - the gold document contents
-        # - the gold_id_match metric depicting whether one of the gold document ids matches the document
-        # - the answer_match metric depicting whether the document contains the answer
-        # - the gold_id_or_answer_match metric depicting whether one of the former two conditions are met
-        for field_name in ["documents", "documents_isolated"]:
-            df = pd.DataFrame()
-            documents = node_output.get(field_name, None)
-            if documents is not None:
-                document_cols_to_keep = ["content", "id"]
-                df_docs = pd.DataFrame(documents, columns=document_cols_to_keep)
-                if len(df_docs) > 0:
-                    df_docs = df_docs.rename(columns={"id": "document_id"})
-                    df_docs["type"] = "document"
-                    df_docs["gold_document_ids"] = [gold_document_ids
-                                                    ] * len(df_docs)
-                    df_docs["gold_document_contents"] = [
-                        gold_document_contents
-                    ] * len(df_docs)
-                    df_docs["gold_id_match"] = df_docs.apply(
-                        lambda row: 1.0
-                        if row["document_id"] in gold_document_ids else 0.0,
-                        axis=1)
-                    df_docs["answer_match"] = df_docs.apply(
-                        lambda row: 1.0 if not query_labels.no_answer and any(
-                            gold_answer in row["content"]
-                            for gold_answer in gold_answers) else 0.0,
-                        axis=1,
-                    )
-                    df_docs["gold_id_or_answer_match"] = df_docs.apply(
-                        lambda row: max(row["gold_id_match"], row["answer_match"
-                                                                  ]),
-                        axis=1)
-                    df_docs["rank"] = np.arange(1, len(df_docs) + 1)
-                df = pd.concat([df, df_docs])
-
-            # add general info
-            df["node"] = node_name
-            df["multilabel_id"] = query_labels.id
-            df["query"] = query
-            df["filters"] = json.dumps(query_labels.filters,
-                                       sort_keys=True).encode()
-            df["eval_mode"] = "isolated" if "isolated" in field_name else "integrated"
-            partial_dfs.append(df)
-
-        return pd.concat(partial_dfs, ignore_index=True)
-
     def get_next_nodes(self, node_id: str, stream_id: str):
         current_node_edges = self.graph.edges(node_id, data=True)
         next_nodes = [
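
For reference, the deleted _build_eval_dataframe flattened each node's output into one row per answer or document, tagged with rank, query, node, and match metrics. A toy reconstruction of its answer branch (column names come from the removed code above; the sample data, the "Reader" node name, and the simplified exact-match lambda are invented for illustration, where the real code used calculate_em_str_multi and calculate_f1_str_multi):

import json

import numpy as np
import pandas as pd

gold_answers = ["Berlin"]  # invented gold label
answers = [{"answer": "Berlin", "document_id": "d1",
            "offsets_in_document": [(0, 6)], "context": "Berlin is ..."}]

df = pd.DataFrame(answers, columns=[
    "answer", "document_id", "offsets_in_document", "context"])
df["type"] = "answer"
df["gold_answers"] = [gold_answers] * len(df)
# Simplified stand-in for calculate_em_str_multi:
df["exact_match"] = df["answer"].apply(
    lambda a: 1.0 if any(a == g for g in gold_answers) else 0.0)
df["rank"] = np.arange(1, len(df) + 1)
df["node"] = "Reader"  # invented node name
df["query"] = "What is the capital of Germany?"
df["filters"] = json.dumps(None, sort_keys=True).encode()
df["eval_mode"] = "integrated"
print(df[["type", "answer", "exact_match", "rank", "node"]])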
