from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import os
import pandas as pd
# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()
ALPHAGENOME_KEY = os.getenv('ALPHAGENOME_KEY')
if not ALPHAGENOME_KEY:
    # Fail before any work is done instead of handing None to the client.
    raise RuntimeError(
        'ALPHAGENOME_KEY is not set; define it in the environment or in a '
        '.env file.'
    )
save_predictions = True

# Load file which only has variants above -log10p of 7 in tsv format.
df = pd.read_csv('gwas1_log10p_gt_7.tsv', sep='\t')
# Build 'chr'-prefixed chromosome names, leaving values that already carry
# the prefix untouched so they do not become 'chrchr...'.
# NOTE(review): a sex chromosome encoded numerically (e.g. 23) would become
# 'chr23' here - confirm how the input file encodes X/Y.
chrom = df['CHROM'].astype(str)
df['CHROM'] = chrom.where(chrom.str.startswith('chr'), 'chr' + chrom)

# Create alphagenome client
dna_model = dna_client.create(ALPHAGENOME_KEY)
# Scoring configuration:
#   - context window: 1MB around each variant
#   - scorer: RNA-seq only
sequence_length = '1MB' # @param ["16KB", "100KB", "500KB", "1MB"] { type:"string" }
# Replace the human-readable label with the matching entry of the client's
# supported-sequence-length table.
sequence_length = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
    'SEQUENCE_LENGTH_' + sequence_length
]

# One on/off switch per scorer name; only the enabled ones are requested.
scorer_selections = dict(
    rna_seq=True,
    cage=False,
    procap=False,
    atac=False,
    dnase=False,
    chip_histone=False,
    chip_tf=False,
    polyadenylation=False,
    splice_sites=False,
    splice_site_usage=False,
    splice_junctions=False,
)

all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
# Keys are lower-cased before the lookup; a scorer with no switch above is
# treated as disabled.
selected_scorers = [
    scorer
    for scorer_key, scorer in all_scorers.items()
    if scorer_selections.get(scorer_key.lower(), False)
]
# Fetch scores: one score_variant request per input row, collected in order.
results = []
# Iterate only the columns that are needed, in lockstep; this avoids
# iterrows(), which yields an index that is never used and builds a Series
# for every row.
variant_fields = zip(
    df['CHROM'], df['GENPOS'], df['ALLELE0'], df['ALLELE1'], df['ID']
)
for chrom, genpos, allele0, allele1, variant_id in tqdm(
    variant_fields, total=len(df)
):
    variant = genome.Variant(
        chromosome=str(chrom),
        position=int(genpos),
        reference_bases=allele0,
        alternate_bases=allele1,
        name=variant_id,
    )
    # Resize the variant's reference interval to the configured sequence
    # length.
    interval = variant.reference_interval.resize(sequence_length)
    variant_scores = dna_model.score_variant(
        interval=interval,
        variant=variant,
        variant_scorers=selected_scorers,
        organism=dna_client.Organism.HOMO_SAPIENS,
    )
    results.append(variant_scores)

# Combine the per-variant results into one table.
df_scores = variant_scorers.tidy_scores(results)
# Filter to protein-coding genes in the brain and save results
ontologies = ['UBERON:0000955']  # Brain
gene_types = ['protein_coding']
keep_rows = (
    df_scores['ontology_curie'].isin(ontologies)
    & df_scores['gene_type'].isin(gene_types)
)
# Take an explicit copy of the filtered rows: adding a column to a
# boolean-indexed slice triggers pandas' SettingWithCopyWarning.
df_scores = df_scores.loc[keep_rows].copy()
# Rank by magnitude of the quantile score, ignoring its sign; largest first.
df_scores['abs_quantile_score'] = df_scores['quantile_score'].abs()
df_scores = df_scores.sort_values('abs_quantile_score', ascending=False)
if save_predictions:
    # Written to the current working directory, without the index column.
    df_scores.to_excel('variant_scores.xlsx', index=False)