[ PROMPT_NODE_26210 ]
manipulation
[ SKILL_DOCUMENTATION ]
# 数据操作
用于转换、提取子集和操作 AnnData 对象的各种方法。
## 提取子集
### 按索引
```python
import anndata as ad
import numpy as np
adata = ad.AnnData(X=np.random.rand(1000, 2000))
# 整数索引
subset = adata[0:100, 0:500] # 前 100 个观测值,前 500 个变量
# 索引列表
obs_indices = [0, 10, 20, 30, 40]
var_indices = [0, 1, 2, 3, 4]
subset = adata[obs_indices, var_indices]
# 单个观测值或变量
single_obs = adata[0, :]
single_var = adata[:, 0]
```
### 按名称
```python
import pandas as pd
# 创建带有命名索引的对象
obs_names = [f'cell_{i}' for i in range(1000)]
var_names = [f'gene_{i}' for i in range(2000)]
adata = ad.AnnData(
X=np.random.rand(1000, 2000),
obs=pd.DataFrame(index=obs_names),
var=pd.DataFrame(index=var_names)
)
# 按观测值名称提取子集
subset = adata[['cell_0', 'cell_1', 'cell_2'], :]
# 按变量名称提取子集
subset = adata[:, ['gene_0', 'gene_10', 'gene_20']]
# 同时操作两个轴
subset = adata[['cell_0', 'cell_1'], ['gene_0', 'gene_1']]
```
### 按布尔掩码
```python
# 创建布尔掩码
high_count_obs = np.random.rand(1000) > 0.5
high_var_genes = np.random.rand(2000) > 0.7
# 使用掩码提取子集
subset = adata[high_count_obs, :]
subset = adata[:, high_var_genes]
subset = adata[high_count_obs, high_var_genes]
```
### 按元数据条件
```python
# 添加元数据
adata.obs['cell_type'] = np.random.choice(['A', 'B', 'C'], 1000)
adata.obs['quality_score'] = np.random.rand(1000)
adata.var['highly_variable'] = np.random.rand(2000) > 0.8
# 按细胞类型过滤
t_cells = adata[adata.obs['cell_type'] == 'A']
# 按多个条件过滤
high_quality_a_cells = adata[
(adata.obs['cell_type'] == 'A') &
(adata.obs['quality_score'] > 0.7)
]
# 按变量元数据过滤
hv_genes = adata[:, adata.var['highly_variable']]
# 复杂条件
filtered = adata[
(adata.obs['quality_score'] > 0.5) &
(adata.obs['cell_type'].isin(['A', 'B'])),
adata.var['highly_variable']
]
```
## 转置
```python
# 转置 AnnData 对象 (交换观测值和变量)
adata_T = adata.T
# 形状改变
print(adata.shape) # (1000, 2000)
print(adata_T.shape) # (2000, 1000)
# obs 和 var 被交换
print(adata.obs.head()) # 观测值元数据
print(adata_T.var.head()) # 相同数据,现在作为变量元数据
# 当数据的行列方向与预期相反时,转置非常有用
# (常见于某些以基因为行、细胞为列的文件格式)
```
## 复制
### 完全复制