Schema

`AggregationFunction` ¶

Bases: str, Enum

Enum of supported aggregation functions

Source code in src/autolabel/schema.py

class AggregationFunction(str, Enum):
    """Enum of supported aggregation functions.

    The ``str`` mixin makes every member a string as well, so a member
    compares equal to its raw value (e.g. ``AggregationFunction.MAX == "max"``).
    """

    MAX = "max"
    MEAN = "mean"

`ConfidenceCacheEntry` ¶

Bases: BaseModel

Source code in src/autolabel/schema.py

class ConfidenceCacheEntry(BaseModel):
    """Cache record holding the logprobs stored for a prompt / raw response pair.

    The cache key (see ``get_id``) is built from ``prompt``, ``raw_response``
    and ``score_type``; the cached payload is ``logprobs``.
    """

    prompt: Optional[str] = ""
    raw_response: Optional[str] = ""
    logprobs: Optional[list] = None
    score_type: Optional[str] = "logprob_average"
    creation_time_ms: Optional[int] = -1
    ttl_ms: Optional[int] = -1

    class Config:
        orm_mode = True

    def get_id(self) -> str:
        """
        Build the identifier of this confidence cache entry.

        Returns:
            the value of ``calculate_md5`` applied to the prompt, raw response
            and score type (in that order)
        """
        key_fields = [self.prompt, self.raw_response, self.score_type]
        return calculate_md5(key_fields)

    def get_serialized_output(self) -> str:
        """
        Serialize the cached payload.

        Returns:
            ``logprobs`` encoded as JSON text
        """
        encoded = json.dumps(self.logprobs)
        return encoded

    def deserialize_output(self, output: str) -> Dict[str, float]:
        """
        Decode a previously serialized payload.

        Args:
            output: JSON text, as produced by ``get_serialized_output``

        Returns:
            the decoded JSON value
        """
        # NOTE(review): ``logprobs`` is declared as a list, so the decoded value
        # is presumably a list rather than the annotated Dict — confirm.
        decoded = json.loads(output)
        return decoded

`deserialize_output(output)` ¶

Deserializes the cache entry output

Source code in src/autolabel/schema.py

def deserialize_output(self, output: str) -> Dict[str, float]:
    """
    Decode a serialized cache entry output.

    Args:
        output: JSON text to decode

    Returns:
        the value ``json.loads`` produces for ``output``
    """
    decoded = json.loads(output)
    return decoded

`get_id()` ¶

Generates a unique ID for the given confidence cache configuration

Source code in src/autolabel/schema.py

def get_id(self) -> str:
    """
    Build the identifier of this confidence cache entry.

    Returns:
        the value of ``calculate_md5`` applied to the prompt, raw response
        and score type (in that order)
    """
    key_fields = [self.prompt, self.raw_response, self.score_type]
    return calculate_md5(key_fields)

`get_serialized_output()` ¶

Returns the serialized cache entry output

Source code in src/autolabel/schema.py

def get_serialized_output(self) -> str:
    """
    Serialize the cached payload.

    Returns:
        ``self.logprobs`` encoded as JSON text
    """
    encoded = json.dumps(self.logprobs)
    return encoded

`Dataset` ¶

Bases: BaseModel

Contains Dataset parameters, including input file path, indexes for state management (e.g. job batching and retries), and a unique ID

Source code in src/autolabel/schema.py

class Dataset(BaseModel):
    """Contains Dataset parameters, including input file path, indexes for state management (e.g. job batching and retries), and a unique ID"""

    id: str
    input_file: str
    start_index: int
    end_index: int

    class Config:
        orm_mode = True

    @classmethod
    def create_id(
        cls,
        dataset: Union[str, pd.DataFrame],
        config: AutolabelConfig,
        start_index: int,
        max_items: int,
    ) -> str:
        """
        Generates a unique ID for the given Dataset configuration

        Args:
            dataset: either 1) input file name or 2) pandas Dataframe
            config:  AutolabelConfig object containing project settings
            start_index: index to begin labeling job at (used for job batching, retries, state management)
            max_items: number of data points to label, beginning at start_index

        Returns:
            filehash: a unique ID generated from an MD5 hash of the function's parameters
        """
        if isinstance(dataset, str):
            # The open handle is passed to calculate_md5 exactly as before; the
            # context manager guarantees it is closed once hashing finishes
            # (or fails) instead of leaking the file descriptor.
            with open(dataset, "rb") as input_handle:
                return calculate_md5(
                    [input_handle, config._dataset_config, start_index, max_items]
                )
        # Anything that is not a path is treated as a DataFrame and hashed via
        # its CSV rendering.
        return calculate_md5(
            [dataset.to_csv(), config._dataset_config, start_index, max_items]
        )

`create_id(dataset, config, start_index, max_items)` `classmethod` ¶

Generates a unique ID for the given Dataset configuration.

Args:

- dataset: either 1) input file name or 2) pandas DataFrame
- config: AutolabelConfig object containing project settings
- start_index: index to begin labeling job at (used for job batching, retries, state management)
- max_items: number of data points to label, beginning at start_index

Returns:

Name	Type	Description
`filehash`	`str`	a unique ID generated from an MD5 hash of the function's parameters

Source code in src/autolabel/schema.py

@classmethod
def create_id(
    cls,
    dataset: Union[str, pd.DataFrame],
    config: AutolabelConfig,
    start_index: int,
    max_items: int,
) -> str:
    """
    Generates a unique ID for the given Dataset configuration

    Args:
        dataset: either 1) input file name or 2) pandas Dataframe
        config:  AutolabelConfig object containing project settings
        start_index: index to begin labeling job at (used for job batching, retries, state management)
        max_items: number of data points to label, beginning at start_index

    Returns:
        filehash: a unique ID generated from an MD5 hash of the function's parameters
    """
    if isinstance(dataset, str):
        # The open handle is passed to calculate_md5 exactly as before; the
        # context manager guarantees it is closed once hashing finishes
        # (or fails) instead of leaking the file descriptor.
        with open(dataset, "rb") as input_handle:
            return calculate_md5(
                [input_handle, config._dataset_config, start_index, max_items]
            )
    # Anything that is not a path is treated as a DataFrame and hashed via
    # its CSV rendering.
    return calculate_md5(
        [dataset.to_csv(), config._dataset_config, start_index, max_items]
    )

`ErrorType` ¶

Bases: str, Enum

Enum of supported error types

Source code in src/autolabel/schema.py

class ErrorType(str, Enum):
    """Enum of supported error types.

    The ``str`` mixin makes every member a string as well, so a member
    compares equal to its raw string value.
    """

    LLM_PROVIDER_ERROR = "llm_provider_error"
    PARSING_ERROR = "parsing_error"
    OUTPUT_GUIDELINES_NOT_FOLLOWED_ERROR = "output_guidelines_not_followed_error"
    EMPTY_RESPONSE_ERROR = "empty_response_error"

`FewShotAlgorithm` ¶

Bases: str, Enum

Enum of supported algorithms for choosing which examples to provide the LLM in its instruction prompt

Source code in src/autolabel/schema.py

class FewShotAlgorithm(str, Enum):
    """Enum of supported algorithms for choosing which examples to provide the LLM in its instruction prompt.

    The ``str`` mixin makes every member a string as well, so a member
    compares equal to its raw string value.
    """

    FIXED = "fixed"
    SEMANTIC_SIMILARITY = "semantic_similarity"
    MAX_MARGINAL_RELEVANCE = "max_marginal_relevance"
    LABEL_DIVERSITY_RANDOM = "label_diversity_random"
    LABEL_DIVERSITY_SIMILARITY = "label_diversity_similarity"

`GenerationCacheEntry` ¶

Bases: BaseModel

Source code in src/autolabel/schema.py

class GenerationCacheEntry(BaseModel):
    """Cache record holding the generations stored for a model / params / prompt triple.

    The cache key (see ``get_id``) is built from ``model_name``,
    ``model_params`` and ``prompt``; the cached payload is ``generations``.
    """

    model_name: str
    prompt: str
    model_params: str
    generations: Optional[List[Union[Generation, ChatGeneration]]] = None
    creation_time_ms: Optional[int] = -1
    ttl_ms: Optional[int] = -1

    class Config:
        orm_mode = True

    def get_id(self) -> str:
        """
        Build the identifier of this generation cache entry.

        Returns:
            the value of ``calculate_md5`` applied to the model name, model
            params and prompt (in that order)
        """
        key_fields = [self.model_name, self.model_params, self.prompt]
        return calculate_md5(key_fields)

    def get_serialized_output(self) -> str:
        """
        Serialize the cached generations.

        Returns:
            JSON text containing the ``dict()`` form of every generation
        """
        # NOTE(review): ``generations`` defaults to None, in which case this
        # loop raises TypeError — callers presumably always set it; confirm.
        payload = []
        for generation in self.generations:
            payload.append(generation.dict())
        return json.dumps(payload)

    def deserialize_output(
        self, output: str
    ) -> List[Union[Generation, ChatGeneration]]:
        """
        Rebuild generation objects from serialized JSON text.

        Args:
            output: JSON text holding a list of generation dicts

        Returns:
            one object per decoded dict: ``Generation`` when the dict's
            ``"type"`` is ``"Generation"``, ``ChatGeneration`` otherwise
        """
        restored = []
        for fields in json.loads(output):
            if fields["type"] == "Generation":
                restored.append(Generation(**fields))
            else:
                restored.append(ChatGeneration(**fields))
        return restored

`deserialize_output(output)` ¶

Deserializes the cache entry output

Source code in src/autolabel/schema.py

def deserialize_output(
    self, output: str
) -> List[Union[Generation, ChatGeneration]]:
    """
    Rebuild generation objects from serialized JSON text.

    Args:
        output: JSON text holding a list of generation dicts

    Returns:
        one object per decoded dict: ``Generation`` when the dict's
        ``"type"`` is ``"Generation"``, ``ChatGeneration`` otherwise
    """
    restored = []
    for fields in json.loads(output):
        if fields["type"] == "Generation":
            restored.append(Generation(**fields))
        else:
            restored.append(ChatGeneration(**fields))
    return restored

`get_id()` ¶

Generates a unique ID for the given generation cache configuration

Source code in src/autolabel/schema.py

def get_id(self) -> str:
    """
    Build the identifier of this generation cache entry.

    Returns:
        the value of ``calculate_md5`` applied to the model name, model
        params and prompt (in that order)
    """
    key_fields = [self.model_name, self.model_params, self.prompt]
    return calculate_md5(key_fields)

`get_serialized_output()` ¶

Returns the serialized cache entry output

Source code in src/autolabel/schema.py

def get_serialized_output(self) -> str:
    """
    Serialize the cached generations.

    Returns:
        JSON text containing the ``dict()`` form of every generation
    """
    payload = []
    for generation in self.generations:
        payload.append(generation.dict())
    return json.dumps(payload)

`LLMAnnotation` ¶

Bases: BaseModel

Contains label information of a given data point, including the generated label, the prompt given to the LLM, and the LLM's response. Optionally includes a confidence_score if supported by the model

Source code in src/autolabel/schema.py

class LLMAnnotation(BaseModel):
    """Contains label information of a given data point, including the generated label, the prompt given to the LLM, and the LLM's response. Optionally includes a confidence_score if supported by the model"""

    # The two required fields: whether labeling succeeded, and the label itself
    # (typed Any, so no particular shape is enforced here).
    successfully_labeled: bool
    label: Any
    # NOTE(review): annotated as bytes but the default is the str "" — confirm
    # whether b"" was intended before relying on the type.
    curr_sample: Optional[bytes] = ""
    confidence_score: Optional[float] = None
    generation_info: Optional[Dict[str, Any]] = None
    # Text fields default to the empty string rather than None.
    raw_response: Optional[str] = ""
    explanation: Optional[str] = ""
    prompt: Optional[str] = ""
    confidence_prompt: Optional[str] = ""
    # Usage figures; None when not provided.
    input_tokens: Optional[int] = None
    output_tokens: Optional[int] = None
    cost: Optional[float] = None
    latency: Optional[float] = None
    # Optional LabelingError attached to this annotation; None by default.
    error: Optional[LabelingError] = None

`LabelingError` ¶

Bases: BaseModel

Contains information about an error that occurred during the labeling process

Source code in src/autolabel/schema.py

class LabelingError(BaseModel):
    """Contains information about an error that occurred during the labeling process"""

    # Category of the error, as an ErrorType member
    error_type: ErrorType
    # Text accompanying the error
    error_message: str

`MetricResult` ¶

Bases: BaseModel

Contains performance metrics gathered from autolabeler runs

Source code in src/autolabel/schema.py

class MetricResult(BaseModel):
    """Contains performance metrics gathered from autolabeler runs"""

    # NOTE(review): presumably one of the MetricType values — confirm
    name: str
    # Typed Any, so no particular shape is enforced for the metric value
    value: Any
    # Defaults to True. NOTE(review): presumably controls whether the metric is
    # shown while a run is in progress — confirm against the callers
    show_running: Optional[bool] = True

`MetricType` ¶

Bases: str, Enum

Enum of supported performance metrics. Some metrics are always available (task agnostic), while others are only supported by certain types of tasks

Source code in src/autolabel/schema.py

class MetricType(str, Enum):
    """Enum of supported performance metrics. Some metrics are always available (task agnostic), while others are only supported by certain types of tasks.

    The ``str`` mixin makes every member a string as well, so a member
    compares equal to its raw string value.
    """

    # Task agnostic
    SUPPORT = "support"
    COMPLETION_RATE = "completion_rate"
    # Classification metrics
    ACCURACY = "accuracy"
    CONFUSION_MATRIX = "confusion_matrix"
    LABEL_DISTRIBUTION = "label_distribution"
    F1 = "f1"
    F1_MICRO = "f1_micro"
    F1_MACRO = "f1_macro"
    F1_WEIGHTED = "f1_weighted"
    TEXT_PARTIAL_MATCH = "text_partial_match"
    # Token Classification metrics
    F1_EXACT = "f1_exact"
    F1_STRICT = "f1_strict"
    F1_PARTIAL = "f1_partial"
    F1_ENT_TYPE = "f1_ent_type"
    # Confidence metrics
    AUROC = "auroc"
    THRESHOLD = "threshold"

    # Aggregate Metrics
    CLASSIFICATION_REPORT = "classification_report"

`ModelProvider` ¶

Bases: str, Enum

Enum containing all LLM providers currently supported by autolabeler

Source code in src/autolabel/schema.py

class ModelProvider(str, Enum):
    """Enum containing all LLM providers currently supported by autolabeler.

    The ``str`` mixin makes every member a string as well, so a member
    compares equal to its raw string value.
    """

    OPENAI = "openai"
    OPENAI_VISION = "openai_vision"
    ANTHROPIC = "anthropic"
    HUGGINGFACE_PIPELINE = "huggingface_pipeline"
    HUGGINGFACE_PIPELINE_VISION = "huggingface_pipeline_vision"
    REFUEL = "refuel"
    GOOGLE = "google"
    COHERE = "cohere"
    CUSTOM = "custom"

`RefuelLLMResult` ¶

Bases: BaseModel

List of generated outputs. This is a List[List[]] because each input could have multiple candidate generations.

Source code in src/autolabel/schema.py

class RefuelLLMResult(BaseModel):
    """Container for the outputs of a labeling job: generations, errors, costs and latencies."""

    # Each attribute docstring is placed directly AFTER the attribute it
    # describes; documentation tools attach a bare string to the assignment
    # that precedes it, so the previous "docstring first" layout shifted every
    # description onto the wrong attribute.

    generations: List[List[Union[Generation, ChatGeneration]]]
    """List of generated outputs. This is a List[List[]] because
    each input could have multiple candidate generations."""

    errors: List[Optional[LabelingError]]
    """Errors encountered while running the labeling job"""

    # NOTE: the `[]` defaults below are kept unchanged; pydantic is documented
    # to copy field defaults per instance, so they are not shared between models.
    costs: Optional[List[float]] = []
    """Costs incurred during the labeling job"""

    latencies: Optional[List[float]] = []
    """Latencies incurred during the labeling job"""

`costs: Optional[List[float]] = []` `class-attribute` `instance-attribute` ¶

Costs incurred during the labeling job

`errors: List[Optional[LabelingError]]` `instance-attribute` ¶

Errors encountered while running the labeling job

`generations: List[List[Union[Generation, ChatGeneration]]]` `instance-attribute` ¶

List of generated outputs. This is a List[List[]] because each input could have multiple candidate generations.

`TaskType` ¶

Bases: str, Enum

Enum containing all the types of tasks that autolabeler currently supports

Source code in src/autolabel/schema.py

class TaskType(str, Enum):
    """Enum containing all the types of tasks that autolabeler currently supports.

    The ``str`` mixin makes every member a string as well, so a member
    compares equal to its raw string value.
    """

    CLASSIFICATION = "classification"
    NAMED_ENTITY_RECOGNITION = "named_entity_recognition"
    QUESTION_ANSWERING = "question_answering"
    ENTITY_MATCHING = "entity_matching"
    MULTILABEL_CLASSIFICATION = "multilabel_classification"
    ATTRIBUTE_EXTRACTION = "attribute_extraction"

Schema

AggregationFunction ¶

ConfidenceCacheEntry ¶

deserialize_output(output) ¶

get_id() ¶

get_serialized_output() ¶

Dataset ¶

create_id(dataset, config, start_index, max_items) classmethod ¶

ErrorType ¶

FewShotAlgorithm ¶

GenerationCacheEntry ¶

deserialize_output(output) ¶

get_id() ¶

get_serialized_output() ¶

LLMAnnotation ¶

LabelingError ¶

MetricResult ¶

MetricType ¶

ModelProvider ¶

RefuelLLMResult ¶

costs: Optional[List[float]] = [] class-attribute instance-attribute ¶

errors: List[Optional[LabelingError]] instance-attribute ¶

generations: List[List[Union[Generation, ChatGeneration]]] instance-attribute ¶

TaskType ¶

`AggregationFunction` ¶

`ConfidenceCacheEntry` ¶

`deserialize_output(output)` ¶

`get_id()` ¶

`get_serialized_output()` ¶

`Dataset` ¶

`create_id(dataset, config, start_index, max_items)` `classmethod` ¶

`ErrorType` ¶

`FewShotAlgorithm` ¶

`GenerationCacheEntry` ¶

`deserialize_output(output)` ¶

`get_id()` ¶

`get_serialized_output()` ¶

`LLMAnnotation` ¶

`LabelingError` ¶

`MetricResult` ¶

`MetricType` ¶

`ModelProvider` ¶

`RefuelLLMResult` ¶

`costs: Optional[List[float]] = []` `class-attribute` `instance-attribute` ¶

`errors: List[Optional[LabelingError]]` `instance-attribute` ¶

`generations: List[List[Union[Generation, ChatGeneration]]]` `instance-attribute` ¶

`TaskType` ¶