# Source code for pyenzyme.enzymeml.core.protein

# File: protein.py
# Project: core
# Author: Jan Range
# License: BSD-2 clause
# Copyright (c) 2022 Institute of Biochemistry and Technical Biochemistry Stuttgart

import re

from pydantic import validator, Field
from typing import Dict, Optional, TYPE_CHECKING, Any
from dataclasses import dataclass

from pyenzyme.enzymeml.core.ontology import SBOTerm
from pyenzyme.enzymeml.core.enzymemlbase import EnzymeMLBase
from pyenzyme.enzymeml.core.exceptions import UniProtIdentifierError
from pyenzyme.enzymeml.core.abstract_classes import AbstractSpecies
from pyenzyme.enzymeml.core.utils import type_checking, deprecated_getter

# Select the decorator applied to the class below. ``typing.TYPE_CHECKING`` is
# assumed true by static type checkers and is false at runtime, so static tools
# see the standard ``dataclass`` decorator while the running program uses the
# project's ``type_checking`` decorator instead.
if TYPE_CHECKING:  # pragma: no cover
    static_check_init_args = dataclass
else:
    static_check_init_args = type_checking


@static_check_init_args
class Protein(EnzymeMLBase, AbstractSpecies):
    # Model describing a protein species: its identity (name, sequence,
    # EC number, UniProt ID, organism), where it is stored (vessel) and its
    # initial concentration. Can be created directly or via ``fromUniProtID``.
    #
    # Documentation is kept in ``#`` comments on purpose: a class docstring
    # would be picked up as the description in the schema generated by pydantic.

    name: Optional[str] = Field(
        None, description="Name of the protein", template_alias="Name"
    )

    sequence: Optional[str] = Field(
        None,
        description="Amino acid sequence of the protein",
        template_alias="Sequence",
    )

    vessel_id: str = Field(
        ...,
        description="Identifier of the vessel in which the protein was stored.",
        template_alias="Vessel",
        regex=r"v[\d.]+",
    )

    init_conc: Optional[float] = Field(
        default=None,
        description="Initial concentration of the protein.",
    )

    unit: Optional[str] = Field(
        None,
        description="Unit of the proteins intial concentration.",
    )

    constant: bool = Field(
        True,
        description="Whether the proteins concentration remains constant or not.",
        template_alias="Constant",
    )

    id: Optional[str] = Field(
        None,
        description="Unique identifier of the protein.",
        template_alias="ID",
        regex=r"p[\d]+",
    )

    meta_id: Optional[str] = Field(
        None,
        description="Unique meta identifier of the protein.",
    )

    # The separators are escaped so that only a literal "." is accepted
    # between the four numeric groups (an unescaped "." matches any character).
    ecnumber: Optional[str] = Field(
        None,
        description="EC number of the protein.",
        template_alias="EC Number",
        regex=r"(\d+\.)(\d+\.)(\d+\.)(\d+)",
    )

    organism: Optional[str] = Field(
        None,
        description="Organism the protein was expressed in.",
        template_alias="Source organism",
    )

    organism_tax_id: Optional[str] = Field(
        None,
        description="Taxonomy identifier of the expression host.",
    )

    boundary: bool = Field(
        False,
        description="Whether the protein is under any boundary conditions (SBML Technicality, better leave it to default)",
    )

    ontology: SBOTerm = Field(
        SBOTerm.PROTEIN,
        description="Ontology describing the characteristic of the protein.",
    )

    uri: Optional[str] = Field(
        None,
        description="URI of the protein.",
    )

    creator_id: Optional[str] = Field(
        None,
        description="Unique identifier of the author.",
    )

    uniprotid: Optional[str] = Field(
        None,
        description="Unique identifier referencing a protein entry at UniProt. Use this identifier to initialize the object from the UniProt database.",
        template_alias="UniProt ID",
    )

    # ! Validators
    @validator("id")
    def set_meta_id(cls, id: Optional[str], values: dict):
        """Derive the meta ID from the ID whenever a non-empty ID is given.

        Args:
            id: The identifier being validated.
            values: Mapping handed to the validator by pydantic; the derived
                ``meta_id`` (``METAID_`` + upper-cased ID) is written into it.

        Returns:
            The ID, unchanged.
        """

        # NOTE(review): ``meta_id`` is declared after ``id``; confirm that the
        # value written here is not replaced when ``meta_id`` itself is
        # processed by pydantic.
        if id:
            values["meta_id"] = f"METAID_{id.upper()}"

        return id

    @validator("sequence")
    def clean_sequence(cls, sequence):
        """Strip all whitespace (including newlines) and upper-case the sequence.

        Falsy values (``None`` or an empty string) are returned unchanged.
        """

        if sequence:
            return re.sub(r"\s+", "", sequence).upper()
        else:
            return sequence

    # ! Initializers
    @classmethod
    def fromUniProtID(
        cls,
        uniprotid: str,
        vessel_id: str,
        init_conc: Optional[float] = None,
        unit: Optional[str] = None,
        constant: bool = False,
    ) -> "Protein":
        """Initializes a protein based on the UniProt database.

        The entry is fetched via ``_getUniProtParameters`` and whatever it
        yields (sequence, name, EC number) is passed on to the constructor
        together with the arguments given here.

        Args:
            uniprotid: UniProt identifier of the entry to fetch.
            vessel_id: Identifier of the vessel the protein is stored in.
            init_conc: Initial concentration of the protein.
            unit: Unit of the initial concentration.
            constant: Whether the concentration remains constant. Note that
                this defaults to ``False`` here, whereas the ``constant``
                field itself defaults to ``True``.

        Raises:
            UniProtIdentifierError: Raised when the UniProt identifier is invalid.

        Returns:
            Protein: The initialized Protein object.
        """

        # Get UniProt Parameters
        parameters = cls._getUniProtParameters(uniprotid=uniprotid)

        return cls(
            init_conc=init_conc,
            unit=unit,
            vessel_id=vessel_id,
            constant=constant,
            uniprotid=uniprotid,
            **parameters,
        )

    @staticmethod
    def _getUniProtParameters(uniprotid: str) -> Dict[str, Any]:
        """Fetch a UniProt XML entry and extract the fields used by ``Protein``.

        Args:
            uniprotid: UniProt identifier of the entry to fetch.

        Raises:
            UniProtIdentifierError: If the request returns HTTP status 404.

        Returns:
            Mapping with any of the keys ``sequence``, ``name`` and
            ``ecnumber``, each holding the text of the first matching XML
            element that carries text.
        """
        import requests
        import xml.etree.ElementTree as ET

        # Send request to the UniProt database
        endpoint = f"https://www.uniprot.org/uniprot/{uniprotid}.xml"

        # Fetch data
        response = requests.get(endpoint)

        # Check if the UniProt ID is correct
        # NOTE(review): other error statuses are not checked and will most
        # likely surface as an XML parse error below.
        if response.status_code == 404:
            raise UniProtIdentifierError(uniprotid=uniprotid)

        # Parse the XML document
        root = ET.fromstring(response.text)

        # Set prefix to match tag
        prefix = r"{http://uniprot.org/uniprot}"

        # Define mapping for the used attributes
        attribute_mapping = {
            prefix + "sequence": "sequence",
            prefix + "fullName": "name",
            prefix + "ecNumber": "ecnumber",
        }

        # Collect parameters. The guard has to look at the *mapped* key: the
        # result is keyed by field name, not by XML tag, so checking the tag
        # would never find an existing value and later elements would
        # overwrite earlier ones.
        parameters: Dict[str, Any] = {}
        for elem in root.iter():
            field_name = attribute_mapping.get(elem.tag)
            if field_name is not None and parameters.get(field_name) is None:
                parameters[field_name] = elem.text

        return parameters

    # ! Getters
    @deprecated_getter("organism_tax_id")
    def getOrganismTaxId(self):
        """Return ``organism_tax_id`` (legacy accessor)."""
        return self.organism_tax_id

    @deprecated_getter("ecnumber")
    def getEcnumber(self):
        """Return ``ecnumber`` (legacy accessor)."""
        return self.ecnumber

    @deprecated_getter("uniprotid")
    def getUniprotID(self):
        """Return ``uniprotid`` (legacy accessor)."""
        return self.uniprotid

    @deprecated_getter("organism")
    def getOrganism(self):
        """Return ``organism`` (legacy accessor)."""
        return self.organism

    @deprecated_getter("init_conc")
    def getInitConc(self):
        """Return ``init_conc`` (legacy accessor)."""
        return self.init_conc

    @deprecated_getter("name")
    def getName(self):
        """Return ``name`` (legacy accessor)."""
        return self.name

    @deprecated_getter("id")
    def getId(self):
        """Return ``id`` (legacy accessor)."""
        return self.id

    @deprecated_getter("meta_id")
    def getMetaid(self):
        """Return ``meta_id`` (legacy accessor)."""
        return self.meta_id

    @deprecated_getter("sequence")
    def getSequence(self):
        """Return ``sequence`` (legacy accessor)."""
        return self.sequence

    @deprecated_getter("ontology")
    def getSboterm(self):
        """Return ``ontology`` (legacy accessor)."""
        return self.ontology

    @deprecated_getter("vessel_id")
    def getVessel(self):
        """Return ``vessel_id`` (legacy accessor)."""
        return self.vessel_id

    @deprecated_getter("unit")
    def getSubstanceUnits(self):
        """Return ``unit`` (legacy accessor)."""
        return self.unit

    @deprecated_getter("boundary")
    def getBoundary(self):
        """Return ``boundary`` (legacy accessor)."""
        return self.boundary

    @deprecated_getter("constant")
    def getConstant(self):
        """Return ``constant`` (legacy accessor)."""
        return self.constant