Source code for galahad.formats

import copy
from dataclasses import dataclass
from typing import List

from galahad.server.annotations import Annotations
from galahad.server.classifier import AnnotationFeatures, AnnotationTypes
from galahad.server.dataclasses import Annotation, Document


@dataclass
class Span:
    """A labelled span expressed in token indices.

    The builders in this module resolve a span to character offsets by
    taking the begin of token ``begin`` and the end of token ``end - 1``,
    i.e. ``end`` is exclusive.
    """

    # Index of the first token covered by the span.
    begin: int
    # Index one past the last token covered by the span.
    end: int
    # Label stored as the value feature of the created annotation.
    value: str
def build_sentence_classification_document(sentences: List[str], labels: List[str], version: int = 0) -> Document:
    """Build a document holding one labelled annotation per sentence.

    The document text is the sentences joined by single spaces. For every
    sentence two annotations with identical offsets are created: a sentence
    annotation and an annotation carrying the label as its value feature.

    Args:
        sentences: Sentence strings, in document order.
        labels: One label per sentence, aligned with ``sentences``.
        version: Version number stored on the resulting document.

    Returns:
        The assembled ``Document``.

    Raises:
        AssertionError: If ``sentences`` and ``labels`` differ in length, or
            if a created annotation does not cover its sentence text.
    """
    assert len(sentences) == len(labels), "Sentences and labels need to have the same length!"

    joined_text = " ".join(sentences)
    builder = Annotations(joined_text)

    sentence_layer = AnnotationTypes.SENTENCE.value
    label_layer = AnnotationTypes.ANNOTATION.value
    value_key = AnnotationFeatures.VALUE.value

    offset = 0
    for sentence_text, sentence_label in zip(sentences, labels):
        stop = offset + len(sentence_text)

        sentence_annotation = builder.create_annotation(sentence_layer, offset, stop)
        assert builder.get_covered_text(sentence_annotation) == sentence_text

        label_annotation = builder.create_annotation(label_layer, offset, stop, {value_key: sentence_label})
        assert builder.get_covered_text(label_annotation) == sentence_text

        # Step over the single space that joins consecutive sentences.
        offset = stop + 1

    return Document(text=joined_text, annotations=builder.to_dict(), version=version)
def build_span_classification_request(
    sentences: List[List[str]], spans: List[List[Span]] = None, version: int = 0
) -> Document:
    """Build a tokenized document, optionally with labelled spans.

    The document text is all tokens of all sentences joined by single
    spaces. A token annotation is created for every token and a sentence
    annotation for every sentence. Each span is resolved against the tokens
    of the sentence at the same position in ``spans``.

    Args:
        sentences: Tokenized sentences (one list of token strings each).
        spans: Per-sentence lists of spans in token indices with exclusive
            ``end``; ``None`` or an empty list creates no span annotations.
        version: Version number stored on the resulting document.

    Returns:
        The assembled ``Document``.

    Raises:
        AssertionError: If a created token annotation does not cover its
            token text.
        KeyError: If a span refers to a token index that does not exist in
            its sentence.
    """
    flat_tokens = [token for tokens in sentences for token in tokens]
    text = " ".join(flat_tokens)
    builder = Annotations(text)

    token_layer = AnnotationTypes.TOKEN.value
    span_layer = AnnotationTypes.ANNOTATION.value
    value_key = AnnotationFeatures.VALUE.value

    # (sentence index, token index) -> (begin offset, end offset)
    offsets = {}
    cursor = 0
    stop = 0
    for sent_no, tokens in enumerate(sentences):
        first_offset = cursor
        for tok_no, surface in enumerate(tokens):
            stop = cursor + len(surface)
            offsets[(sent_no, tok_no)] = (cursor, stop)
            created = builder.create_annotation(token_layer, cursor, stop)
            assert builder.get_covered_text(created) == surface
            # Step over the single space that joins consecutive tokens.
            cursor = stop + 1
        # The sentence ends where its last token ends.
        builder.create_annotation(AnnotationTypes.SENTENCE.value, first_offset, stop)

    for sent_no, sentence_spans in enumerate(spans or []):
        for span in sentence_spans:
            span_begin = offsets[(sent_no, span.begin)][0]
            # span.end is exclusive, so the last covered token is end - 1.
            span_end = offsets[(sent_no, span.end - 1)][1]
            builder.create_annotation(span_layer, span_begin, span_end, {value_key: span.value})

    return Document(text=text, annotations=builder.to_dict(), version=version)
def build_span_classification_response(original_doc: Document, spans: List[Span] = None, version: int = 0) -> Document:
    """Return a copy of ``original_doc`` with labelled span annotations added.

    Span indices refer to positions in the list of all tokens covered by the
    range from the begin of the first sentence to the end of the last
    sentence; ``end`` is exclusive.

    Args:
        original_doc: Document that already has token and sentence
            annotations. It is deep-copied, not modified.
        spans: Spans to add. ``None`` (the default) or an empty list adds
            nothing.
        version: Version number stored on the returned document.

    Returns:
        The annotated copy of ``original_doc``.

    Raises:
        AssertionError: If the document lacks token or sentence annotations.
    """
    annotated_doc = copy.deepcopy(original_doc)
    annotated_doc.version = version

    assert AnnotationTypes.TOKEN.value in original_doc.annotations
    assert AnnotationTypes.SENTENCE.value in original_doc.annotations

    annotations = Annotations.from_dict(annotated_doc.text, annotated_doc.annotations)
    sentences = annotations.select(AnnotationTypes.SENTENCE.value)

    # Stand-in annotation spanning first to last sentence, used only to
    # select every token of the document in one call.
    dummy_text_annotation = {"begin": sentences[0].begin, "end": sentences[-1].end}
    all_tokens = annotations.select_covered(AnnotationTypes.TOKEN.value, Annotation(**dummy_text_annotation))

    # `spans` defaults to None; iterate an empty list in that case instead of
    # failing with a TypeError.
    for span in spans or []:
        first_token = all_tokens[span.begin]
        last_token = all_tokens[span.end - 1]
        annotations.create_annotation(
            AnnotationTypes.ANNOTATION.value,
            first_token.begin,
            last_token.end,
            {AnnotationFeatures.VALUE.value: span.value},
        )

    annotated_doc.annotations = annotations.get_annotations()
    return annotated_doc
def build_doc_from_tokens_and_text(text: str, sentences: List[List[str]]) -> Document:
    """Build a document by aligning pre-tokenized sentences with raw text.

    Each token is located in ``text`` by searching forward from the end of
    the previously matched token, so tokens must appear in ``text`` in the
    given order. A sentence spans from the begin of its first token to the
    end of its last token; an empty sentence becomes a zero-length
    annotation at the current position.

    Args:
        text: The raw document text.
        sentences: Tokenized sentences (one list of token strings each).

    Returns:
        A version-0 ``Document`` with ``"t.token"`` and ``"t.sentence"``
        annotation lists.

    Raises:
        ValueError: If a token cannot be found in the remaining text.
    """
    sentence_list = []
    token_list = []
    position = 0

    for sentence in sentences:
        sentence_begin = None
        for token in sentence:
            # Search from the current position instead of re-slicing the
            # remaining text, which copied the tail for every token.
            begin = text.find(token, position)
            if begin < 0:
                # A -1 here used to be folded into the offsets, silently
                # shifting this and every following annotation.
                raise ValueError(f"Token {token!r} not found in text at or after offset {position}")
            end = begin + len(token)
            token_list.append(Annotation(**{"begin": begin, "end": end, "features": {}}))
            if sentence_begin is None:
                sentence_begin = begin
            position = end

        if sentence_begin is None:
            sentence_begin = position
        # Anchor the sentence on its first token rather than assuming exactly
        # one separator character after the previous sentence.
        sentence_list.append(Annotation(**{"begin": sentence_begin, "end": position, "features": {}}))

    return Document(text=text, annotations={"t.token": token_list, "t.sentence": sentence_list}, version=0)
def build_token_labeling_response(original_doc: Document, labels: List[str] = None, version: int = 0) -> Document:
    """Return a copy of ``original_doc`` with one labelled annotation per token.

    Args:
        original_doc: Document that already has token and sentence
            annotations. It is deep-copied, not modified.
        labels: One label per token, in the order of the document's
            ``"t.token"`` annotations. ``None`` (the default) is treated as
            an empty list.
        version: Version number stored on the returned document.

    Returns:
        The annotated copy of ``original_doc``.

    Raises:
        AssertionError: If the document lacks token or sentence annotations,
            or if the number of labels differs from the number of tokens.
    """
    # Normalise the None default so the length check below fails with a
    # clear message instead of a TypeError from len(None).
    if labels is None:
        labels = []

    annotated_doc = copy.deepcopy(original_doc)
    annotated_doc.version = version

    assert AnnotationTypes.TOKEN.value in original_doc.annotations
    assert AnnotationTypes.SENTENCE.value in original_doc.annotations

    # NOTE(review): the literal "t.token" is presumably the same key as
    # AnnotationTypes.TOKEN.value — confirm.
    tokens = original_doc.annotations["t.token"]
    assert len(tokens) == len(labels), "Tokens and labels need to have the same length!"

    annotations = Annotations.from_dict(annotated_doc.text, annotated_doc.annotations)
    for token, label in zip(tokens, labels):
        annotations.create_annotation(
            AnnotationTypes.ANNOTATION.value,
            token.begin,
            token.end,
            {AnnotationFeatures.VALUE.value: label},
        )

    annotated_doc.annotations = annotations.get_annotations()
    return annotated_doc
def build_span_classification_response_per_sentence(
    original_doc: Document, spans: List[List[Span]] = None, version: int = 0
) -> Document:
    """Return a copy of ``original_doc`` with per-sentence span annotations.

    Span indices refer to positions in the list of tokens covered by the
    sentence at the same position in ``spans``; ``end`` is exclusive.

    Args:
        original_doc: Document that already has token and sentence
            annotations. It is deep-copied, not modified.
        spans: One list of spans per sentence, in sentence order. ``None``
            (the default) means no spans for any sentence.
        version: Version number stored on the returned document.

    Returns:
        The annotated copy of ``original_doc``.

    Raises:
        AssertionError: If the document lacks token or sentence annotations,
            or if ``spans`` does not have one entry per sentence.
    """
    annotated_doc = copy.deepcopy(original_doc)
    annotated_doc.version = version

    assert AnnotationTypes.TOKEN.value in original_doc.annotations
    assert AnnotationTypes.SENTENCE.value in original_doc.annotations

    annotations = Annotations.from_dict(annotated_doc.text, annotated_doc.annotations)
    sentences = annotations.select(AnnotationTypes.SENTENCE.value)

    # The None default used to crash in len(spans); treat it as an empty
    # span list for every sentence.
    if spans is None:
        spans = [[] for _ in sentences]

    assert len(sentences) == len(spans)

    for sentence, cur_spans in zip(sentences, spans):
        tokens = annotations.select_covered(AnnotationTypes.TOKEN.value, sentence)
        for span in cur_spans:
            first_token = tokens[span.begin]
            last_token = tokens[span.end - 1]
            annotations.create_annotation(
                AnnotationTypes.ANNOTATION.value,
                first_token.begin,
                last_token.end,
                {AnnotationFeatures.VALUE.value: span.value},
            )

    annotated_doc.annotations = annotations.get_annotations()
    return annotated_doc