Prepare gene sets • hitype

Using gene sets from ScType

hitype is designed to be compatible with ScType, so the gene sets provided by ScType can be used directly.

library(hitype)
#> The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
#> which was just loaded, were retired in October 2023.
#> Please refer to R-spatial evolution reports for details, especially
#> https://r-spatial.org/r/2023/05/15/evolution4.html.
#> It may be desirable to make the sf package available;
#> package maintainers should consider adding sf to Suggests:.

gs <- gs_prepare(
  "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_short.xlsx"
)
gs$gene_sets[[1]]$`Acinar cells`
#> $markers
#>  [1] "CTRB1"    "KLK1"     "RBPJL"    "PTF1A"    "CELA3A"   "PRSS1"   
#>  [7] "SPINK1"   "ZG16"     "CEL"      "CELA2A"   "CPB1"     "CELA1"   
#> [13] "RNASE1"   "AMY2B"    "CPA2"     "CPA1"     "CELA3B"   "PNLIP"   
#> [19] "CTRB2"    "PLA2G1B"  "PRSS2"    "CLPS"     "REG1A"    "SYCN"    
#> [25] "PNLIPRP1" "CTRC"     "REG3A"    "PRSS3"    "REG1B"    "CFB"     
#> [31] "GDF15"    "MUC1"     "C15orf48" "AKR1C3"   "OLFM4"    "GSTA1"   
#> [37] "LGALS2"   "PDZK1IP1" "RARRES2"  "CXCL17"   "GSTA2"    "ANPEP"   
#> [43] "LYZ"      "ANGPTL4"  "ALDOB"   
#> 
#> $weights
#>  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
#> [39] 1 1 1 1 1 1 1

# gs <- gs_prepare(
#  "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_long.xlsx"
# )

These two gene sets are now also built into hitype:

# gs <- gs_prepare(hitypedb_short)
# gs <- gs_prepare(hitypedb_full)

Built-in gene sets

hitype also provides some built-in gene sets:

Gene sets from ScType_short

head(hitypedb_short)
#>      tissueType         cellName
#> 1 Immune system      Pro-B cells
#> 2 Immune system      Pre-B cells
#> 3 Immune system Immature B cells
#> 4 Immune system    Naive B cells
#> 5 Immune system   Memory B cells
#> 6 Immune system   Plasma B cells
#>                                                                                                                                                              geneSymbolmore1
#> 1                                       CD27,IgD,CD24,PTPRC,PAX5,CD24,CD38,CD79A,DNTT,C10orf10,VPREB1,ARPP21,CD99,IGLL1,CD9,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,VPREB3,IGLL5
#> 2                                            CD19,CD27,IgD,CD24,PTPRC,PAX5,CD24,CD38,CD79A,NSMCE1,PCDH9,ACSM3,CCDC191,TCL1A,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,VPREB3,IGLL5
#> 3                                                CD19,CD27,IgD,CD24,CD20,MS4A1,PTPRC,PAX5,CD24,CD38,CD79A,CXCL8,S100A12,LYZ,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,VPREB3,IGLL5
#> 4           CD19,IgD,CD38,CD24,CD20,MS4A1,PTPRC,PAX5,CD24,CD38,CD79A,JCHAIN,SSR4,FKBP11,SEC11C,DERL3,PRDX4,IGLL5,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,CD138,CD38,VPREB3,IGLL5
#> 5 CD19,CD27,IgD,CD38,CD24,CD20,MS4A1,PTPRC,PAX5,CD24,CD38,CD79A,JCHAIN,SSR4,FKBP11,SEC11C,DERL3,PRDX4,IGLL5,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,CD138,CD38,CD27,VPREB3,IGLL5
#> 6      CD19,CD27,IgD,CD38,CD24,CD20,MS4A1,PTPRC,PAX5,CD24,CD38,CD79A,JCHAIN,SSR4,FKBP11,SEC11C,DERL3,PRDX4,IGLL5,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,CD138,CD38,VPREB3,IGLL5
#>   geneSymbolmore2
#> 1            <NA>
#> 2            <NA>
#> 3            <NA>
#> 4            CD27
#> 5            <NA>
#> 6      CD20,MS4A1

Gene sets from ScType_full

head(hitypedb_full)
#>      tissueType           cellName
#> 1 Immune system        Pro-B cells
#> 2 Immune system        Pre-B cells
#> 3 Immune system      Naive B cells
#> 4 Immune system     Memory B cells
#> 5 Immune system     Plasma B cells
#> 6 Immune system Naive CD8+ T cells
#>                                                                                                                                                                     geneSymbolmore1
#> 1                                              CD27,IgD,CD24,PTPRC,PAX5,CD24,CD38,CD79A,DNTT,C10orf10,VPREB1,ARPP21,CD99,IGLL1,CD9,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,VPREB3,IGLL5
#> 2                                                   CD19,CD27,IgD,CD24,PTPRC,PAX5,CD24,CD38,CD79A,NSMCE1,PCDH9,ACSM3,CCDC191,TCL1A,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,VPREB3,IGLL5
#> 3                  CD19,IgD,CD38,CD24,CD20,MS4A1,PTPRC,PAX5,CD24,CD38,CD79A,JCHAIN,SSR4,FKBP11,SEC11C,DERL3,PRDX4,IGLL5,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,CD138,CD38,VPREB3,IGLL5
#> 4        CD19,CD27,IgD,CD38,CD24,CD20,MS4A1,PTPRC,PAX5,CD24,CD38,CD79A,JCHAIN,SSR4,FKBP11,SEC11C,DERL3,PRDX4,IGLL5,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,CD138,CD38,CD27,VPREB3,IGLL5
#> 5                  CD27,IgD,CD38,CD24,CD20,MS4A1,PTPRC,PAX5,CD24,CD38,CD79A,JCHAIN,SSR4,FKBP11,SEC11C,DERL3,PRDX4,IGLL5,CD79B,TCL1A,IGLL5,HLA-DQA1,HLA-DQB1,CD138,CD38,VPREB3,IGLL5
#> 6 CD8,CD2,CD3D,CD3E,CD3G,CD3Z,CD45RA,CD62L,CD27,CD127,FOXP3,CCR7,CD45,CD8A,CD8B,CCR6,CD11b,CD30,CD6,CTLA4,IL2RA,GZMB,PTPRC,SELL,CCR7,GNLY,Trac,Ltb,Cd52,Trbc2,Shisa5,Lck,Thy1,Dapl1
#>               geneSymbolmore2    shortName
#> 1                        <NA>        Pro-B
#> 2                        <NA>        Pre-B
#> 3                        <NA>      Naive B
#> 4                        <NA>     Memory B
#> 5                  CD20,MS4A1     Plasma B
#> 6 CD25,CD44,CD69,HLA-DRA,CD95 Naive CD8+ T

Gene sets and weights trained from PBMC 3k dataset

hitypedb_pbmc3k
#>       cellName level geneSymbolmore2       geneSymbolmore1
#> 1            B     1                                MS4A1+
#> 2   CD14+ Mono     1                           CD14++,LYZ+
#> 3       CD8+ T     1                             CD8A+++++
#> 4           DC     1                  FCER1A+++++,CST3++++
#> 5 FCFR3A+ Mono     1                 FCGR3A+++++,MS4A7++++
#> 6  Memory CD4+     1                      IL7R++++,S100A4+
#> 7           NK     1                         GNLY++,NKG7++
#> 8 Naive CD4+ T     1                          IL7R+,CCR7++
#> 9     Platelet     1                             PPBP+++++

Preparing your own gene sets

You can also prepare your own gene sets. The gene sets should be a data.frame or a tab-delimited file with the following columns:

tissueType (optional): Tissue type. One can pass tissue_type to gs_prepare() to filter gene sets by tissue type.
cellName (required): Cell type name.
geneSymbolmore1 (required): Markers for the cell type. Multiple markers should be separated by commas (,). Append a + suffix to indicate that the marker is a positive marker, or a - suffix to indicate that it is a negative marker. The suffix can be repeated to indicate strength: multiple +’s indicate a strong positive marker, and multiple -’s indicate a strong negative marker. For example, CD3E+++ indicates that CD3E is a strong positive marker, and CD14--- indicates that CD14 is a strong negative marker.
geneSymbolmore2 (required): Negative markers for the cell type. The column must be present, but its values may be empty strings. It is kept for compatibility with ScType. No suffixes are allowed in this column. A marker listed here is equivalent to the same marker listed in geneSymbolmore1 with a single - suffix.
level (optional): The level of the cellName in the cell type hierarchy. Levels must start from 1 and increase by 1. The final cell type consists of one cellName from each level.
nextLevels: Indication of the possible cellNames at the subsequent levels. Levels should be separated by ;. The cellNames within each level should be separated by ,.
- The format is cellName1,cellName2;cellName3,cellName4;... for each cellName at the current level.
- For example, for CD4 at level 1, a nextLevels value of Naive;Activated indicates that CD4 should be followed by Naive at level 2 and Activated at level 3.
- If the nextLevels is also specified for Naive at level 2, then the cellName at level 3 for CD4 will be the intersection of Activated and the cellNames at level 3 for Naive limited by nextLevels for Naive.
- If a level in nextLevels is empty, then the cellNames at that level will be all cellNames at the next level limited by nextLevels of that level.
- ! can be used to indicate that the cellNames at the next level should be excluded. For example, !Naive indicates that Naive should be excluded from the cellNames at the next level.
- If ! is the only character in a level, then all cellNames at that level will be excluded.