Register files from Census release 2023-07-25¶

import lamindb as ln
import lnschema_bionty as lb

# import cellxgene_census
import pandas as pd

💡 lamindb instance: laminlabs/cellxgene

ln.track()

💡 notebook imports: lamindb==0.67.2 lnschema_bionty==0.39.0 pandas==2.1.4 requests==2.31.0
💡 loaded: Transform(uid='pNa7RdI26sp45zKv', name='Register files from Census release 2023-07-25', short_name='census-release-2023-07-25', version='1', type='notebook', updated_at=2024-01-27 05:27:26 UTC, created_by_id=1)
💡 loaded: Run(uid='dJ9t75LeOeqYWA4B0WbA', run_at=2024-01-30 09:03:47 UTC, transform_id=18, created_by_id=1)

census_version = "2023-07-25"  # LTS release of Census

Register collections (updated 2024-01-27)¶

# Select every artifact whose version matches this Census release.
artifacts = ln.Artifact.filter(version=census_version).all()
artifacts.count()

# Bundle the selected artifacts into one versioned collection and persist it.
collection = ln.Collection(artifacts, name="cellxgene-census", version=census_version)
collection.save()

# Query the collections registered under this release version.
collections = ln.Collection.filter(version=census_version).all()
collections.count()

Register datasets¶

Get the h5ad files directory on s3 from Census:

# NOTE(review): `cellxgene_census` must be imported for this cell to run; its
# import is commented out in the import cell at the top of this notebook.
# Take the "stable" entry of the Census version directory and read the URI of
# its h5ads location.
h5ad_dir = (
    cellxgene_census.get_census_version_directory()
    .get("stable")
    .get("h5ads")
    .get("uri")
)
h5ad_dir

's3://cellxgene-data-public/cell-census/2023-07-25/h5ads/'

ln.UPath(h5ad_dir).view_tree()

 (0 sub-directories & 850 files with suffixes '.h5ad'): 
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
...

# Create file records for everything under the release's h5ads prefix on S3
# and save them.
# NOTE(review): the path is hard-coded here rather than taken from `h5ad_dir`.
files = ln.File.from_dir("s3://cellxgene-data-public/cell-census/2023-07-25/h5ads")
ln.save(files)

# Group the files into a single dataset versioned by the Census release.
dataset = ln.Dataset(files, name="cellxgene-census", version=census_version)
dataset.save()

# Re-load the dataset by name and version, then fetch the files linked to it.
dataset = ln.Dataset.filter(name="cellxgene-census", version=census_version).one()
files = dataset.files.all()

Register metadata¶

Get all datasets and their associated metadata using the CELLxGENE REST API:

import requests


def get_metadata_from_cxg(timeout: float = 60.0):
    """Fetch the metadata of all datasets from the CELLxGENE curation API.

    Args:
        timeout: Seconds to wait for the server to connect or to send data
            before giving up, so that a stalled connection cannot hang the
            notebook indefinitely.

    Returns:
        The decoded JSON body returned by the `/curation/v1/datasets` endpoint.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    api_url_base = "https://api.cellxgene.cziscience.com"
    datasets_path = "/curation/v1/datasets"
    datasets_url = f"{api_url_base}{datasets_path}"
    headers = {"Content-Type": "application/json"}
    res = requests.get(url=datasets_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error payload.
    res.raise_for_status()
    return res.json()

cellxgene_meta = get_metadata_from_cxg()
len(cellxgene_meta)

cellxgene_meta[0].keys()

dict_keys(['assay', 'assets', 'cell_count', 'cell_type', 'collection_doi', 'collection_id', 'collection_name', 'collection_version_id', 'dataset_id', 'dataset_version_id', 'development_stage', 'disease', 'donor_id', 'explorer_url', 'is_primary_data', 'mean_genes_per_cell', 'organism', 'primary_cell_count', 'processing_status', 'published_at', 'revised_at', 'schema_version', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'title', 'tombstone', 'x_approximate_distribution'])

features¶

# Maps each obs-level metadata field to the registry name used for its labels.
obs_features = dict(
    assay="bionty.ExperimentalFactor",
    cell_type="bionty.CellType",
    development_stage="bionty.DevelopmentalStage",
    disease="bionty.Disease",
    donor_id="core.ULabel",
    self_reported_ethnicity="bionty.Ethnicity",
    sex="bionty.Phenotype",
    suspension_type="core.ULabel",
    tissue="bionty.Tissue",
)

# Build one categorical Feature per obs field, tied to its registry.
obs_features_records = [
    ln.Feature(name=feature_name, type="category", registries=registry_name)
    for feature_name, registry_name in obs_features.items()
]
ln.save(obs_features_records)
# Group the features into a feature set and link it to all files in the "obs" slot.
obs_feature_set = ln.FeatureSet(features=obs_features_records, name="obs features")
obs_feature_set.save()
obs_feature_set.files.set(files, through_defaults={"slot": "obs"})

# Dataset-level ("external") fields and the registry name used for each.
ext_features = dict(organism="bionty.Organism", collection="core.ULabel")

# Build one categorical Feature per external field, tied to its registry.
ext_features_records = [
    ln.Feature(name=feature_name, type="category", registries=registry_name)
    for feature_name, registry_name in ext_features.items()
]
ln.save(ext_features_records)
# Group the features into a feature set and link it to all files in the "external" slot.
ext_feature_set = ln.FeatureSet(features=ext_features_records, name="external features")
ext_feature_set.save()
ext_feature_set.files.set(files, through_defaults={"slot": "external"})

collections, organisms¶

Register collections:

# Parent label under which every collection label is grouped.
is_collection = ln.ULabel(name="is_collection")
is_collection.save()

# Deduplicate collections across datasets as (name, doi, id) triples.
collections_meta = {
    (meta["collection_name"], meta["collection_doi"], meta["collection_id"])
    for meta in cellxgene_meta
}

# One ULabel per collection: the DOI goes into the description and the
# collection id into the reference.
collections_records = [
    ln.ULabel(
        name=title,
        description=doi,
        reference=reference_id,
        reference_type="collection_id",
    )
    for title, doi, reference_id in collections_meta
]
ln.save(collections_records)
is_collection.children.add(*collections_records)

Register organisms:

ncbitaxon_source = lb.BiontySource.filter(source="ncbitaxon").one()

# Distinct organism ontology ids mentioned by any dataset.
organisms_meta = {
    organism["ontology_term_id"]
    for meta in cellxgene_meta
    for organism in meta["organism"]
}

organisms_records = lb.Organism.from_values(
    organisms_meta, field=lb.Organism.ontology_id, bionty_source=ncbitaxon_source
)
# rename house mouse to mouse
for organism_record in organisms_records:
    if organism_record.name == "house mouse":
        organism_record.name = "mouse"
ln.save(organisms_records, parents=False)

Annotate files with collections and organisms:

ext_features = ext_feature_set.members.lookup()
files = dataset.files.all()
collections = is_collection.children.all()
organisms = lb.Organism.filter().all()

for dataset_meta in cellxgene_meta:
    # the file key contains the dataset id; datasets without a file are skipped
    file = files.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is not None:
        # label the file with the collection it belongs to
        collection = ln.ULabel.filter(reference=dataset_meta["collection_id"]).one()
        file.labels.add(collection, feature=ext_features.collection)

        # label the file with every organism listed for the dataset
        taxon_ids = [entry["ontology_term_id"] for entry in dataset_meta["organism"]]
        matching_organisms = lb.Organism.filter(ontology_id__in=taxon_ids).list()
        file.labels.add(matching_organisms, feature=ext_features.organism)

ontologies¶

Register all ontology ids:

from typing import Optional
from lnschema_bionty.models import Registry
from lamindb.dev._feature_manager import get_accessor_by_orm

obs_features_records = obs_feature_set.members.lookup()
ACCESSORS = get_accessor_by_orm(ln.File)
# feature name -> (accessor name on File, registry class holding the labels)
FEATURE_TO_ACCESSOR = {}
for name in obs_features:
    registries = getattr(obs_features_records, name).registries
    accessor = ACCESSORS.get(registries)
    relation_field = getattr(ln.File, accessor).field
    orm = relation_field.model
    # TODO: ulabels are defined in the File model, improve this in LaminDB
    if orm == ln.File:
        orm = relation_field.related_model
    FEATURE_TO_ACCESSOR[name] = (accessor, orm)


def create_ontology_record_from_source(
    ontology_id: str,
    from_orm: Registry,
    target_orm: Registry,
    bionty_source: Optional[lb.BiontySource] = None,
):
    """Build a `target_orm` record from a term looked up through `from_orm`.

    The term is fetched with `from_orm.from_bionty` and its name, description,
    ontology id and source id are copied onto a new `target_orm` record.

    Args:
        ontology_id: Ontology id of the term to look up.
        from_orm: Registry used to look the term up.
        target_orm: Registry in which the new record is created.
        bionty_source: Optional source passed through to `from_bionty`.

    Returns:
        The new, unsaved `target_orm` record, or `None` if it could not be
        built (the failure is printed rather than raised).
    """
    from_record = from_orm.from_bionty(
        ontology_id=ontology_id, bionty_source=bionty_source
    )
    try:
        return target_orm(
            name=from_record.name,
            description=from_record.description,
            ontology_id=from_record.ontology_id,
            bionty_source_id=from_record.bionty_source_id,
        )
    except Exception as error:
        # Best effort: keep going with the remaining terms, but report the
        # failure instead of swallowing it silently.
        target_name = getattr(target_orm, "__name__", target_orm)
        print(f"could not create {target_name} record for {ontology_id!r}: {error!r}")
        return None

ln.settings.upon_create_search_names = False

# For every ontology-backed obs field, gather the distinct
# (label, ontology_term_id) pairs found across all datasets.
ontology_ids = {}
for name in obs_features:
    if name in ("donor_id", "suspension_type"):
        continue
    ontology_ids[name] = {
        (term["label"], term["ontology_term_id"])
        for meta in cellxgene_meta
        if name in meta
        for term in meta[name]
    }

bionty_source_pato_query = lb.BiontySource.filter(source="pato")
bionty_source_ds_mouse = lb.BiontySource.filter(
    entity="DevelopmentalStage", organism="mouse"
).one()
bionty_source_pato = bionty_source_pato_query.one()

# register all ontology ids
for name, terms in ontology_ids.items():
    print(f"registering {name}")
    accessor, orm = FEATURE_TO_ACCESSOR[name]
    terms_ids = [term_id for _, term_id in terms]

    # First pass: everything that validates against the registry's default source.
    records = orm.from_values(terms_ids, field="ontology_id")
    if len(records) > 0:
        ln.save(records)

    # Second pass: handle whatever is still not validated after saving.
    inspect_result = orm.inspect(terms_ids, field="ontology_id", mute=True)
    non_validated = inspect_result.non_validated
    if len(non_validated) == 0:
        continue

    if name == "development_stage":
        # Retry with the mouse DevelopmentalStage source, ...
        records = orm.from_values(
            non_validated,
            field="ontology_id",
            bionty_source=bionty_source_ds_mouse,
        )
        # ... copy UBERON terms over from the Tissue registry, ...
        records += [
            create_ontology_record_from_source(
                ontology_id=term_id, from_orm=lb.Tissue, target_orm=orm
            )
            for term_id in non_validated
            if term_id.startswith("UBERON:")
        ]
        # ... and register the literal "unknown" as its own record.
        records += [
            orm(name=term_id, ontology_id=term_id)
            for term_id in non_validated
            if term_id == "unknown"
        ]
    else:
        non_validated_ids = set(non_validated)
        # Non-PATO leftovers are registered as-is with the label from the metadata.
        records = [
            orm(name=label, ontology_id=term_id)
            for label, term_id in terms
            if (not term_id.startswith("PATO:")) and (term_id in non_validated_ids)
        ]
        # PATO terms are copied over from the Phenotype registry's pato source.
        records += [
            create_ontology_record_from_source(
                ontology_id=term_id,
                from_orm=lb.Phenotype,
                target_orm=orm,
                bionty_source=bionty_source_pato,
            )
            for term_id in non_validated
            if term_id.startswith("PATO:")
        ]

    # create_ontology_record_from_source returns None when it cannot build a
    # record; drop those entries so they never reach ln.save.
    records = [record for record in records if record is not None]
    if records:
        print(f"registered {len(records)} records: {records}")
        ln.save(records)

Show code cell output Hide code cell output

registering assay
❗ did not create ExperimentalFactor record for 1 non-validated ontology_id: 'EFO:0700016'
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registered 1 records: [ExperimentalFactor(uid='gWUGSA9l', name='Smart-seq v4', ontology_id='EFO:0700016', created_by_id=1)]
registering cell_type
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registering development_stage
❗ did not create DevelopmentalStage records for 6 non-validated ontology_ids: 'UBERON:0018241', 'UBERON:0000113', 'UBERON:0034919', 'UBERON:0007220', 'UBERON:0007222', 'unknown'
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registered 6 records: [DevelopmentalStage(uid='wksJWjer', name='prime adult stage', ontology_id='UBERON:0018241', description='A Life Cycle Stage That Starts At Completion Of Development And Growth Of The Sexually Mature Adult Animal, And Ends Before Senescence.', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='GDaE3j6Z', name='post-juvenile adult stage', ontology_id='UBERON:0000113', description='The Stage Of Being A Sexually Mature Adult Animal.', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='l00DTC4g', name='juvenile stage', ontology_id='UBERON:0034919', description='The Stage Of Being No More Dependent Of The Nest And/Or From Caregivers For Subsistence While Having Not Reach Sexual Maturity.', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='lNh8U4YZ', name='late embryonic stage', ontology_id='UBERON:0007220', description='An Embryo Stage That Covers Late Steps Of The Embryogenesis With A Fully Formed Embryo Still Developing Before Birth Or Egg Hatching.', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='hqyIKjfF', name='late adult stage', ontology_id='UBERON:0007222', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=1)]
registering disease
❗ did not create Disease record for 1 non-validated ontology_id: 'PATO:0000461'
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registered 1 records: [Disease(uid='4r2nqggf', name='normal', ontology_id='PATO:0000461', description='A Quality Inhering In A Bearer By Virtue Of The Bearer'S Exhibiting No Deviation From Normal Or Average.', bionty_source_id=38, created_by_id=1)]
registering self_reported_ethnicity
❗ did not create Ethnicity records for 3 non-validated ontology_ids: 'multiethnic', 'na', 'unknown'
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registered 3 records: [Ethnicity(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=1), Ethnicity(uid='UY1fNAFT', name='na', ontology_id='na', created_by_id=1), Ethnicity(uid='8lAgy5Ej', name='multiethnic', ontology_id='multiethnic', created_by_id=1)]
registering sex
❗ did not create Phenotype records for 3 non-validated ontology_ids: 'PATO:0000384', 'unknown', 'PATO:0000383'
registered 3 records: [Phenotype(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=1), Phenotype(uid='Pl1UiuS0', name='male', ontology_id='PATO:0000384', description='A Biological Sex Quality Inhering In An Individual Or A Population Whose Sex Organs Contain Only Male Gametes.', bionty_source_id=38, created_by_id=1), Phenotype(uid='hSl0sSF0', name='female', ontology_id='PATO:0000383', description='A Biological Sex Quality Inhering In An Individual Or A Population That Only Produces Gametes That Can Be Fertilised By Male Gametes.', bionty_source_id=38, created_by_id=1)]
registering tissue
❗ did not create Tissue records for 18 non-validated ontology_ids: 'CL:0000010 (cell culture)', 'CL:0000082 (cell culture)', 'CL:0000084 (cell culture)', 'CL:0000115 (cell culture)', 'CL:0000351 (cell culture)', 'CL:0002322 (cell culture)', 'CL:0002327 (cell culture)', 'CL:0002328 (cell culture)', 'CL:0002334 (cell culture)', 'CL:0002335 (cell culture)', 'CL:0002633 (cell culture)', 'CL:0010003 (cell culture)', 'UBERON:0000088 (organoid)', 'UBERON:0000310 (organoid)', 'UBERON:0000966 (organoid)', 'UBERON:0001295 (organoid)', 'UBERON:0002048 (organoid)', 'UBERON:0002370 (organoid)'
❗ now recursing through parents: this only happens once, but is much slower than bulk saving
registered 18 records: [Tissue(uid='x3tRcugV', name='trophoblast (organoid)', ontology_id='UBERON:0000088 (organoid)', created_by_id=1), Tissue(uid='UoElNxsj', name='endothelial cell (cell culture)', ontology_id='CL:0000115 (cell culture)', created_by_id=1), Tissue(uid='9YB5clqY', name='cultured cell (cell culture)', ontology_id='CL:0000010 (cell culture)', created_by_id=1), Tissue(uid='WSs6UA9e', name='lung (organoid)', ontology_id='UBERON:0002048 (organoid)', created_by_id=1), Tissue(uid='CevFMDqD', name='preadipocyte (cell culture)', ontology_id='CL:0002334 (cell culture)', created_by_id=1), Tissue(uid='RkE6D8y1', name='endometrium (organoid)', ontology_id='UBERON:0001295 (organoid)', created_by_id=1), Tissue(uid='rIPA0OEl', name='T cell (cell culture)', ontology_id='CL:0000084 (cell culture)', created_by_id=1), Tissue(uid='dwdBlCNp', name='breast (organoid)', ontology_id='UBERON:0000310 (organoid)', created_by_id=1), Tissue(uid='Ash8pGf8', name='trophoblast cell (cell culture)', ontology_id='CL:0000351 (cell culture)', created_by_id=1), Tissue(uid='uS0Cw8zN', name='retina (organoid)', ontology_id='UBERON:0000966 (organoid)', created_by_id=1), Tissue(uid='vg9s890t', name='respiratory basal cell (cell culture)', ontology_id='CL:0002633 (cell culture)', created_by_id=1), Tissue(uid='lfIFQFR5', name='epithelial cell of lung (cell culture)', ontology_id='CL:0000082 (cell culture)', created_by_id=1), Tissue(uid='w6gzNa8D', name='mammary gland epithelial cell (cell culture)', ontology_id='CL:0002327 (cell culture)', created_by_id=1), Tissue(uid='yPk6E1V8', name='epithelial cell of alveolus of lung (cell culture)', ontology_id='CL:0010003 (cell culture)', created_by_id=1), Tissue(uid='K4RSNRBc', name='thymus (organoid)', ontology_id='UBERON:0002370 (organoid)', created_by_id=1), Tissue(uid='9ICArUMH', name='embryonic stem cell (cell culture)', ontology_id='CL:0002322 (cell culture)', created_by_id=1), Tissue(uid='7MzqN14b', name='bronchial epithelial cell (cell 
culture)', ontology_id='CL:0002328 (cell culture)', created_by_id=1), Tissue(uid='kWD0kb5x', name='brown preadipocyte (cell culture)', ontology_id='CL:0002335 (cell culture)', created_by_id=1)]

donors and suspension_types¶

# Gather the distinct donor ids and suspension types across all datasets.
donor_ids = set()
suspension_types = set()

for entry in cellxgene_meta:
    for key, bucket in (
        ("donor_id", donor_ids),
        ("suspension_type", suspension_types),
    ):
        if key in entry:
            bucket.update(entry[key])

# Parent labels that group donor ids and suspension types respectively.
is_donor = ln.ULabel(name="is_donor", description="parent of donor ids")
is_donor.save()

is_suspension_type = ln.ULabel(
    name="is_suspension_type", description="parent of suspension types"
)
is_suspension_type.save()

# Create labels only for donor ids not validated among is_donor's children.
is_donor = ln.ULabel.filter(name="is_donor").one()
donors = is_donor.children.all()
result = donors.inspect(donor_ids, mute=True)
new_donors = [ln.ULabel(name=donor_id) for donor_id in result.non_validated]
ln.save(new_donors)
is_donor.children.add(*new_donors)

# Same for suspension types under is_suspension_type.
is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
stypes = is_suspension_type.children.all()
result = stypes.inspect(suspension_types, mute=True)
new_stypes = [ln.ULabel(name=stype) for stype in result.non_validated]
ln.save(new_stypes)
is_suspension_type.children.add(*new_stypes)

Annotate files with metadata¶

features = ln.Feature.lookup()

# Fields whose values are plain names stored as ULabels rather than ontology terms.
ULABEL_FIELDS = ("donor_id", "suspension_type")

n_datasets = len(cellxgene_meta)
for idx, dataset_meta in enumerate(cellxgene_meta):
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {n_datasets}")
    file = files.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        continue
    for field, terms in dataset_meta.items():
        if field not in FEATURE_TO_ACCESSOR:
            continue
        accessor, orm = FEATURE_TO_ACCESSOR[field]
        if field not in ULABEL_FIELDS:
            # ontology-backed field: look records up by ontology id and link
            # them through the file's accessor for that registry
            term_ids = [term["ontology_term_id"] for term in terms]
            records = orm.from_values(term_ids, field="ontology_id")
            if len(records) > 0:
                getattr(file, accessor).add(*records)
            continue
        records = orm.from_values(terms, field="name")
        if len(records) > 0:
            # stratify by feature so that link tables records are written
            file.labels.add(records, feature=getattr(features, field))

Validate and register genes¶

# register synthetic constructs and sars_cov_2 as new organisms
for taxon_id in ("NCBITaxon:32630", "NCBITaxon:2697049"):
    extra_organism = lb.Organism.from_bionty(
        ontology_id=taxon_id, bionty_source=ncbitaxon_source
    )
    extra_organism.save(parents=False)

# genes files
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)
# All gene tables live in the same directory of the single-cell-curation repo;
# keys are the organism lookup names, values the full download URLs.
_GENES_BASE_URL = (
    "https://github.com/chanzuckerberg/single-cell-curation/raw/main/"
    "cellxgene_schema_cli/cellxgene_schema/ontology_files"
)
genes_files = {
    organism_key: f"{_GENES_BASE_URL}/{file_name}"
    for organism_key, file_name in (
        ("homo_sapiens", "genes_homo_sapiens.csv.gz"),
        ("mus_musculus", "genes_mus_musculus.csv.gz"),
        ("synthetic_construct", "genes_ercc.csv.gz"),
        ("severe_acute_respiratory_syndrome_coronavirus_2", "genes_sars_cov_2.csv.gz"),
    )
}

Register all genes for each organism:

for organism_name, genes_file in genes_files.items():
    print(f"registering {organism_name} genes")
    # the gene table has no header; its first column becomes the index
    df = pd.read_csv(genes_file, header=None, index_col=0)
    gene_ids = df.index
    organism_record = getattr(organisms, organism_name)

    gene_records = lb.Gene.from_values(
        gene_ids, field=lb.Gene.ensembl_gene_id, organism=organism_record
    )
    ln.save(gene_records)

    validated = lb.Gene.validate(
        gene_ids, field=lb.Gene.ensembl_gene_id, organism=organism_record
    )
    # register legacy genes manually
    new_records = [
        lb.Gene(
            ensembl_gene_id=gene_id,
            symbol=df.loc[gene_id, 1],
            organism=organism_record,
        )
        for gene_id in gene_ids[~validated]
    ]
    ln.save(new_records)

    # one feature set per organism holding both validated and manual genes
    all_genes = gene_records + new_records
    genes_feature_set = ln.FeatureSet(
        features=all_genes, name=f"all {organism_record.name} genes"
    )
    genes_feature_set.save()

Show code cell output Hide code cell output

registering homo_sapiens genes
❗ did not create Gene records for 147 non-validated ensembl_gene_ids: 'ENSG00000112096', 'ENSG00000137808', 'ENSG00000161149', 'ENSG00000182230', 'ENSG00000203812', 'ENSG00000204092', 'ENSG00000205485', 'ENSG00000212951', 'ENSG00000215271', 'ENSG00000221995', 'ENSG00000224739', 'ENSG00000224745', 'ENSG00000225178', 'ENSG00000225932', 'ENSG00000226377', 'ENSG00000226380', 'ENSG00000226403', 'ENSG00000227021', 'ENSG00000227220', 'ENSG00000227902', ...
❗ 147 terms (0.20%) are not validated for ensembl_gene_id: ENSG00000269933, ENSG00000261737, ENSG00000259834, ENSG00000256374, ENSG00000263464, ENSG00000203812, ENSG00000272196, ENSG00000272880, ENSG00000284299, ENSG00000270188, ENSG00000287116, ENSG00000237133, ENSG00000224739, ENSG00000227902, ENSG00000239467, ENSG00000272551, ENSG00000280374, ENSG00000284741, ENSG00000236886, ENSG00000229352, ...
registering mus_musculus genes
❗ did not create Gene records for 135 non-validated ensembl_gene_ids: 'ENSMUSG00000022591', 'ENSMUSG00000045506', 'ENSMUSG00000053706', 'ENSMUSG00000053861', 'ENSMUSG00000066378', 'ENSMUSG00000066810', 'ENSMUSG00000066936', 'ENSMUSG00000067085', 'ENSMUSG00000067122', 'ENSMUSG00000067292', 'ENSMUSG00000067627', 'ENSMUSG00000067929', 'ENSMUSG00000068181', 'ENSMUSG00000069518', 'ENSMUSG00000072693', 'ENSMUSG00000073290', 'ENSMUSG00000073291', 'ENSMUSG00000073682', 'ENSMUSG00000074210', 'ENSMUSG00000074302', ...
❗ 135 terms (0.20%) are not validated for ensembl_gene_id: ENSMUSG00000022591, ENSMUSG00000094127, ENSMUSG00000066936, ENSMUSG00000116275, ENSMUSG00000091312, ENSMUSG00000098794, ENSMUSG00000079353, ENSMUSG00000096240, ENSMUSG00000079286, ENSMUSG00000085431, ENSMUSG00000075015, ENSMUSG00000075014, ENSMUSG00000078091, ENSMUSG00000075006, ENSMUSG00000079175, ENSMUSG00000079171, ENSMUSG00000079170, ENSMUSG00000079169, ENSMUSG00000090353, ENSMUSG00000100963, ...
registering synthetic_construct genes
❗ loading non-default source inside a LaminDB instance
❗ no Bionty source found, skipping Bionty validation
❗ loading non-default source inside a LaminDB instance
❗ did not create Gene records for 92 non-validated ensembl_gene_ids: 'ERCC-00002', 'ERCC-00003', 'ERCC-00004', 'ERCC-00009', 'ERCC-00012', 'ERCC-00013', 'ERCC-00014', 'ERCC-00016', 'ERCC-00017', 'ERCC-00019', 'ERCC-00022', 'ERCC-00024', 'ERCC-00025', 'ERCC-00028', 'ERCC-00031', 'ERCC-00033', 'ERCC-00034', 'ERCC-00035', 'ERCC-00039', 'ERCC-00040', ...
❗ 92 terms (100.00%) are not validated for ensembl_gene_id: ERCC-00002, ERCC-00003, ERCC-00004, ERCC-00009, ERCC-00012, ERCC-00013, ERCC-00014, ERCC-00016, ERCC-00017, ERCC-00019, ERCC-00022, ERCC-00024, ERCC-00025, ERCC-00028, ERCC-00031, ERCC-00033, ERCC-00034, ERCC-00035, ERCC-00039, ERCC-00040, ...
registering severe_acute_respiratory_syndrome_coronavirus_2 genes
❗ loading non-default source inside a LaminDB instance
❗ no Bionty source found, skipping Bionty validation
❗ loading non-default source inside a LaminDB instance
❗ did not create Gene records for 12 non-validated ensembl_gene_ids: 'ENSSASG00005000002', 'ENSSASG00005000003', 'ENSSASG00005000004', 'ENSSASG00005000006', 'ENSSASG00005000010', 'ENSSASG00005000007', 'ENSSASG00005000011', 'ENSSASG00005000009', 'ENSSASG00005000012', 'ENSSASG00005000008', 'ENSSASG00005000005', 'ENSSASG00005000013'
❗ 12 terms (100.00%) are not validated for ensembl_gene_id: ENSSASG00005000002, ENSSASG00005000003, ENSSASG00005000004, ENSSASG00005000006, ENSSASG00005000010, ENSSASG00005000007, ENSSASG00005000011, ENSSASG00005000009, ENSSASG00005000012, ENSSASG00005000008, ENSSASG00005000005, ENSSASG00005000013

Link metadata to individual files¶

Annotate each file with the genes measured in it:

# var-name prefix -> lookup attribute of the organism whose genes carry that prefix
EXTRA_GENE_ORGANISMS = (
    ("ERCC", "synthetic_construct"),
    ("ENSSASG", "severe_acute_respiratory_syndrome_coronavirus_2"),
)

for idx, file in enumerate(files):
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(files)}")

    adata_backed = file.backed()
    var_names = adata_backed.var_names
    organism_record = file.organism.first()
    if organism_record is None:
        print(f"No organism found for file: {file}")
        continue
    genes = lb.Gene.from_values(
        var_names, field=lb.Gene.ensembl_gene_id, organism=organism_record
    )

    # when var names with one of the extra prefixes are present, also look
    # the var names up under the corresponding organism
    for prefix, organism_attr in EXTRA_GENE_ORGANISMS:
        if var_names.str.startswith(prefix).any():
            genes += lb.Gene.from_values(
                var_names,
                field=lb.Gene.ensembl_gene_id,
                organism=getattr(organisms, organism_attr),
            )

    var_feature_set_file = ln.FeatureSet(genes, type="number")
    var_feature_set_file.save()
    file.feature_sets.add(var_feature_set_file, through_defaults={"slot": "var"})

These files are annotated as rhesus or pig but use human genes:

# uids of the files whose var names are looked up as human genes
human_gene_file_uids = ("Np1PSgWwIIYPWz0USN8z", "PuqnmUwzXQ56VPATgy9b")

for uid in human_gene_file_uids:
    file = ln.File.filter(uid=uid).one()
    adata_backed = file.backed()
    genes = lb.Gene.from_values(
        adata_backed.var_names, field=lb.Gene.ensembl_gene_id, organism="human"
    )
    var_feature_set_file = ln.FeatureSet(genes, type="number")
    var_feature_set_file.save()
    file.feature_sets.add(var_feature_set_file, through_defaults={"slot": "var"})

file.describe()

File(uid='PuqnmUwzXQ56VPATgy9b', key='cell-census/2023-07-25/h5ads/db4a9ed2-e994-40c1-b7ec-4091fdf7b6c1.h5ad', suffix='.h5ad', accessor='AnnData', description='A transcriptional cross species map of pancreatic islet cells', size=286688588, hash='HXRDjbTdQSYFOXtU9q09qQ-35', hash_type='md5-n', visibility=1, key_is_virtual=False, updated_at=2023-11-28 22:52:09 UTC)

Provenance:
  🗃️ storage: Storage(uid='oIYGbD74', root='s3://cellxgene-data-public', type='s3', region='us-west-2', updated_at=2023-10-16 15:04:08 UTC, created_by_id=1)
  📔 transform: Transform(uid='pNa7RdI26sp4z8', name='Register files from Census release 2023-07-25', short_name='census-release-2023-07-25', version='0', type='notebook', updated_at=2023-11-28 21:30:25 UTC, created_by_id=1)
  👣 run: Run(uid='ZYgsnqK5v2hPmFlS0kfG', run_at=2023-11-29 10:04:46 UTC, transform_id=11, created_by_id=1)
  👤 created_by: User(uid='kmvZDIX9', handle='sunnyosun', name='Sunny Sun', updated_at=2023-11-28 21:14:48 UTC)
Features:
  obs: FeatureSet(uid='kwKICViF5O3QjHdg0nov', name='obs features', n=9, type='category', registry='core.Feature', hash='Bx10EzvDxdlAVjqVKdKC', updated_at=2023-11-29 09:28:28 UTC, created_by_id=1)
    🔗 assay (1, bionty.ExperimentalFactor): '10x 3' v2'
    🔗 cell_type (4, bionty.CellType): 'pancreatic PP cell', 'type B pancreatic cell', 'pancreatic A cell', 'pancreatic D cell'
    🔗 development_stage (1, bionty.DevelopmentalStage): 'prime adult stage'
    🔗 disease (1, bionty.Disease): 'normal'
    🔗 donor_id (1, core.ULabel): 'pig_donor'
    🔗 self_reported_ethnicity (1, bionty.Ethnicity): 'na'
    🔗 sex (1, bionty.Phenotype): 'female'
    🔗 suspension_type (1, core.ULabel): 'cell'
    🔗 tissue (1, bionty.Tissue): 'islet of Langerhans'
  external: FeatureSet(uid='zIgncie4AywRKgLmKHUW', name='external features', n=2, type='category', registry='core.Feature', hash='5E4xD6tOhDB5EOnLx3tv', updated_at=2023-11-29 09:28:20 UTC, created_by_id=1)
    🔗 organism (1, bionty.Organism): 'domestic pig'
    🔗 collection (1, core.ULabel): 'A transcriptional cross species map of pancreatic islet cells'
  var: FeatureSet(uid='nxOy4SXpndR819ksIxDx', n=15824, type='number', registry='bionty.Gene', hash='gfxllJBvAvyJBu8S2gIF', updated_at=2023-11-29 13:46:53 UTC, created_by_id=1)
    'SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'PERM1', 'HES4', 'ISG15', 'AGRN', 'TTLL10', 'TNFRSF18', 'TNFRSF4', 'SDF4', 'B3GALT6', 'C1QTNF12', 'UBE2J2', 'SCNN1D', 'ACAP3', 'PUSL1', 'INTS11', 'TAS1R3', ...
Labels:
  🏷️ organism (1, bionty.Organism): 'domestic pig'
  🏷️ tissues (1, bionty.Tissue): 'islet of Langerhans'
  🏷️ cell_types (4, bionty.CellType): 'pancreatic PP cell', 'type B pancreatic cell', 'pancreatic A cell', 'pancreatic D cell'
  🏷️ diseases (1, bionty.Disease): 'normal'
  🏷️ phenotypes (1, bionty.Phenotype): 'female'
  🏷️ experimental_factors (1, bionty.ExperimentalFactor): '10x 3' v2'
  🏷️ developmental_stages (1, bionty.DevelopmentalStage): 'prime adult stage'
  🏷️ ethnicities (1, bionty.Ethnicity): 'na'
  🏷️ ulabels (3, core.ULabel): 'A transcriptional cross species map of pancreatic islet cells', 'pig_donor', 'cell'

Link metadata to dataset¶

feature sets:

# (substring of the feature set name, slot under which it is linked to the dataset)
feature_set_slots = (
    ("obs", "obs"),
    ("ext", "external"),
    ("human", "var-human"),
    ("mouse", "var-mouse"),
    ("sars-2", "var-sars-cov-2"),
    ("synthetic construct", "var-ercc"),
)
for name_fragment, slot in feature_set_slots:
    matching_feature_set = ln.FeatureSet.filter(name__contains=name_fragment).one()
    dataset.feature_sets.add(matching_feature_set, through_defaults={"slot": slot})

def _children_excluding_unlinked(parent):
    """Return the children of `parent`, excluding those matching `files=None`."""
    return parent.children.all().filter().exclude(files=None).all()


is_donor = ln.ULabel.filter(name="is_donor").one()
donors = _children_excluding_unlinked(is_donor)
is_collection = ln.ULabel.filter(name="is_collection").one()
collections = _children_excluding_unlinked(is_collection)
is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
stypes = _children_excluding_unlinked(is_suspension_type)

# (records to attach, feature they are attached under), in the order they are added
dataset_label_sources = [
    (donors, features.donor_id),
    (collections, features.collection),
    (stypes, features.suspension_type),
    (lb.ExperimentalFactor.filter().exclude(files=None).all(), features.assay),
    (lb.CellType.filter().exclude(files=None).all(), features.cell_type),
    (
        lb.DevelopmentalStage.filter().exclude(files=None).all(),
        features.development_stage,
    ),
    (lb.Disease.filter().exclude(files=None).all(), features.disease),
    (
        lb.Ethnicity.filter().exclude(files=None).all(),
        features.self_reported_ethnicity,
    ),
    (lb.Phenotype.filter().exclude(files=None).all(), features.sex),
    (lb.Tissue.filter().exclude(files=None).all(), features.tissue),
]
for label_records, label_feature in dataset_label_sources:
    dataset.labels.add(label_records, label_feature)

dataset.describe()

Dataset(uid='OirHTWDrudY2TYltvIX1', name='cellxgene-census', version='2023-07-25', hash='pEJ9uvIeTLvHkZW2TBT5', visibility=1, updated_at=2023-11-28 21:46:40 UTC)

Provenance:
  📔 transform: Transform(uid='pNa7RdI26sp4z8', name='Register files from Census release 2023-07-25', short_name='census-release-2023-07-25', version='0', type='notebook', updated_at=2023-11-28 21:30:25 UTC, created_by_id=1)
  👣 run: Run(uid='ZYgsnqK5v2hPmFlS0kfG', run_at=2023-11-29 10:04:46 UTC, transform_id=11, created_by_id=1)
  👤 created_by: User(uid='kmvZDIX9', handle='sunnyosun', name='Sunny Sun', updated_at=2023-11-28 21:14:48 UTC)
  ⬇️ input_of (core.Run): ['2023-11-29 12:51:05 UTC']
Features:
  obs: FeatureSet(uid='kwKICViF5O3QjHdg0nov', name='obs features', n=9, type='category', registry='core.Feature', hash='Bx10EzvDxdlAVjqVKdKC', updated_at=2023-11-29 09:28:28 UTC, created_by_id=1)
    🔗 assay (32, bionty.ExperimentalFactor): 'Seq-Well S3', 'GEXSCOPE technology', 'sci-Plex', 'DroNc-seq', 'MERFISH', 'snmC-Seq2', 'CEL-seq2', '10x 5' transcription profiling', 'Drop-seq', 'microwell-seq', ...
    🔗 cell_type (699, bionty.CellType): 'cell of skeletal muscle', 'T-helper 1 cell', 'mesothelial fibroblast', 'kidney collecting duct epithelial cell', 'microglial cell', 'type G enteroendocrine cell', 'pericyte', 'supporting cell', 'CD14-positive, CD16-positive monocyte', 'retinal ganglion cell', ...
    🔗 development_stage (215, bionty.DevelopmentalStage): 'Theiler stage 19', '16 weeks', '17 weeks', 'Theiler stage 21', '26 weeks', '7 weeks', '8 month-old stage', '15 weeks', '5 month-old stage', '5 weeks', ...
    🔗 disease (76, bionty.Disease): 'epilepsy', 'long COVID-19', 'brain neoplasm', 'Alzheimer disease', 'influenza', 'Crohn disease', 'systemic lupus erythematosus', 'acute promyelocytic leukemia', 'squamous cell lung carcinoma', 'B-cell non-Hodgkin lymphoma', ...
    🔗 donor_id (6871, core.ULabel): 'D367', 'H20.33.032', '372317', 'SG_HEL_H136', 'SF11644', 'H18.03.318', '252599', 'KR_SGI_H049', 'homosapiens_None_2023_None_sikkemalisa_002_d10_1101_2022_03_10_483747210I', '426003', ...
    🔗 self_reported_ethnicity (28, bionty.Ethnicity): 'Singaporean Chinese', 'African American', 'Irish', 'South Asian', 'Pacific Islander', 'Bangladeshi', 'Hispanic or Latin American', 'admixed ancestry', 'Oceanian', 'European', ...
    🔗 sex (3, bionty.Phenotype): 'unknown', 'male', 'female'
    🔗 suspension_type (3, core.ULabel): 'cell', 'nucleus', 'na'
    🔗 tissue (298, bionty.Tissue): 'nose', 'cervical lymph node', 'body of stomach', 'bronchus', 'tongue', 'fimbria of uterine tube', 'renal glomerulus', 'olfactory region', 'mesenteric artery', 'subcutaneous abdominal adipose tissue', ...
  external: FeatureSet(uid='zIgncie4AywRKgLmKHUW', name='external features', n=2, type='category', registry='core.Feature', hash='5E4xD6tOhDB5EOnLx3tv', updated_at=2023-11-29 09:28:20 UTC, created_by_id=1)
    🔗 organism (5, bionty.Organism): 'domestic pig', 'mouse', 'white-tufted-ear marmoset', 'human', 'rhesus macaque'
    🔗 collection (146, core.ULabel): 'Abdominal White Adipose Tissue', 'A molecular single-cell lung atlas of lethal COVID-19', 'Spatial multiomics map of trophoblast development in early pregnancy', 'Blood and immune development in human fetal bone marrow and Down syndrome', 'Mapping the developing human immune system across organs', 'Evolution of cellular diversity in primary motor cortex of human, marmoset monkey, and mouse', 'Construction of a human cell landscape at single-cell level', 'Impaired local intrinsic immunity to SARS-CoV-2 infection in severe COVID-19', 'Single-cell transcriptomes of the human skin reveal age-related loss of fibroblast priming', 'A single-cell transcriptome atlas of the adult human retina', ...
  var-ercc: FeatureSet(uid='VDiO6vtqPe58U4HJPHeD', name='all synthetic construct genes', n=92, type='number', registry='bionty.Gene', hash='rMxzn166gRykjOZFnWRy', updated_at=2023-11-29 09:26:16 UTC, created_by_id=1)
    'ERCC-00002 (spike-in control)', 'ERCC-00003 (spike-in control)', 'ERCC-00004 (spike-in control)', 'ERCC-00009 (spike-in control)', 'ERCC-00012 (spike-in control)', 'ERCC-00013 (spike-in control)', 'ERCC-00014 (spike-in control)', 'ERCC-00016 (spike-in control)', 'ERCC-00017 (spike-in control)', 'ERCC-00019 (spike-in control)', 'ERCC-00022 (spike-in control)', 'ERCC-00024 (spike-in control)', 'ERCC-00025 (spike-in control)', 'ERCC-00028 (spike-in control)', 'ERCC-00031 (spike-in control)', 'ERCC-00033 (spike-in control)', 'ERCC-00034 (spike-in control)', 'ERCC-00035 (spike-in control)', 'ERCC-00039 (spike-in control)', 'ERCC-00040 (spike-in control)', ...
  var-mouse: FeatureSet(uid='h10gJKScXD72BjnxbIHD', name='all mouse genes', n=55416, type='number', registry='bionty.Gene', hash='umPHI2jmFQXA78M69WBD', updated_at=2023-11-29 09:26:09 UTC, created_by_id=1)
    '4933401J01Rik', 'Gm26206', 'Xkr4', 'Gm18956', 'Gm37180', 'Gm37363', 'Gm37686', 'Gm1992', 'Gm37329', 'Gm7341', 'Gm38148', 'Gm19938', 'Gm10568', 'Gm38385', 'Gm27396', 'Gm37381', 'Rp1', 'Gm6101', 'Gm37483', 'Sox17', ...
  var-human: FeatureSet(uid='CXzMBf4cCDtBq8N5Sg4a', name='all human genes', n=60664, type='number', registry='bionty.Gene', hash='DOnOv7runwo4TOR5P_do', updated_at=2023-11-29 10:29:23 UTC, created_by_id=1)
    'DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A', 'OR4G4P', 'OR4G11P', 'OR4F5', 'None', 'None', 'CICP27', 'None', 'None', 'None', 'None', 'RNU6-1100P', 'None', 'DDX11L17', 'WASH9P', ...
  var-sars-cov-2: FeatureSet(uid='Q1oqEHSXHAogP5Ralgw2', name='all sars-2 genes', n=12, type='number', registry='bionty.Gene', hash='CLCjr_EazVM8KxnA7jhc', updated_at=2023-11-29 10:09:19 UTC, created_by_id=1)
    'ORF1ab_ENSSASG00005000002', 'ORF1ab_ENSSASG00005000003', 'S', 'ORF3a', 'E', 'M', 'ORF6', 'ORF7a', 'ORF7b', 'ORF8', 'N', 'ORF10'
Labels:
  🏷️ organism (5, bionty.Organism): 'domestic pig', 'mouse', 'white-tufted-ear marmoset', 'human', 'rhesus macaque'
  🏷️ tissues (298, bionty.Tissue): 'nose', 'cervical lymph node', 'body of stomach', 'bronchus', 'tongue', 'fimbria of uterine tube', 'renal glomerulus', 'olfactory region', 'mesenteric artery', 'subcutaneous abdominal adipose tissue', ...
  🏷️ cell_types (699, bionty.CellType): 'cell of skeletal muscle', 'T-helper 1 cell', 'mesothelial fibroblast', 'kidney collecting duct epithelial cell', 'microglial cell', 'type G enteroendocrine cell', 'pericyte', 'supporting cell', 'CD14-positive, CD16-positive monocyte', 'retinal ganglion cell', ...
  🏷️ diseases (76, bionty.Disease): 'epilepsy', 'long COVID-19', 'brain neoplasm', 'Alzheimer disease', 'influenza', 'Crohn disease', 'systemic lupus erythematosus', 'acute promyelocytic leukemia', 'squamous cell lung carcinoma', 'B-cell non-Hodgkin lymphoma', ...
  🏷️ phenotypes (3, bionty.Phenotype): 'unknown', 'male', 'female'
  🏷️ experimental_factors (32, bionty.ExperimentalFactor): 'Seq-Well S3', 'GEXSCOPE technology', 'sci-Plex', 'DroNc-seq', 'MERFISH', 'snmC-Seq2', 'CEL-seq2', '10x 5' transcription profiling', 'Drop-seq', 'microwell-seq', ...
  🏷️ developmental_stages (215, bionty.DevelopmentalStage): 'Theiler stage 19', '16 weeks', '17 weeks', 'Theiler stage 21', '26 weeks', '7 weeks', '8 month-old stage', '15 weeks', '5 month-old stage', '5 weeks', ...
  🏷️ ethnicities (28, bionty.Ethnicity): 'Singaporean Chinese', 'African American', 'Irish', 'South Asian', 'Pacific Islander', 'Bangladeshi', 'Hispanic or Latin American', 'admixed ancestry', 'Oceanian', 'European', ...
  🏷️ ulabels (7020, core.ULabel): 'Abdominal White Adipose Tissue', 'A molecular single-cell lung atlas of lethal COVID-19', 'Spatial multiomics map of trophoblast development in early pregnancy', 'Blood and immune development in human fetal bone marrow and Down syndrome', 'Mapping the developing human immune system across organs', 'Evolution of cellular diversity in primary motor cortex of human, marmoset monkey, and mouse', 'Construction of a human cell landscape at single-cell level', 'Impaired local intrinsic immunity to SARS-CoV-2 infection in severe COVID-19', 'Single-cell transcriptomes of the human skin reveal age-related loss of fibroblast priming', 'A single-cell transcriptome atlas of the adult human retina', ...