"""
Functions for managing the GRASP database.
`get_session()` is used everywhere in the module to create a connection to the
database. `initialize_database()` is used to build the database from the GRASP
file. It takes about an hour 90 minutes to run and will overwrite any existing
database.
"""
import os as _os
from re import compile as _recompile
from bz2 import open as _bopen
from gzip import open as _zopen
from html import unescape as _unescape
# Date parsing
from datetime import date as _date
from dateutil.parser import parse as _dateparse
# Database
from sqlalchemy import create_engine as _create_engine
from sqlalchemy.sql import select as _select
from sqlalchemy.orm import sessionmaker as _sessionmaker
# Progress bar
from tqdm import tqdm as _tqdm
# Table Declarations
from .tables import Base as _Base
from .tables import Study as _Study
from .tables import SNP as _SNP
from .tables import Phenotype as _Phenotype
from .tables import PhenoCats as _PhenoCats
from .tables import Platform as _Platform
from .tables import Population as _Population
from .tables import snp_pheno_assoc as _snp_pheno_assoc
from .tables import study_pheno_assoc as _study_pheno_assoc
from .tables import study_plat_assoc as _study_plat_assoc
# Database config
from .config import config as _config
# Reference objects
from .ref import PopFlag as _PopFlag
from .ref import pheno_synonyms
from .ref import pop_correction
__all__ = ["get_session", "initialize_database"]
###############################################################################
# Core Functions #
###############################################################################
[docs]def get_session(echo=False):
"""Return a session and engine, uses config file.
Args:
echo: Echo all SQL to the console.
Returns:
session, engine: A SQLAlchemy session and engine object corresponding
to the grasp database for use in querying.
"""
db_type = _config['DEFAULT']['DatabaseType']
if db_type == 'sqlite':
engine_string = ('sqlite:///{db_file}'
.format(db_file=_config['sqlite']['DatabaseFile']))
else:
engine_string = ('{type}://{user}:{passwd}@{host}/grasp'
.format(type=db_type,
user=_config['other']['DatabaseUser'],
passwd=_config['other']['DatabasePass'],
host=_config['other']['DatabaseHost']))
engine = _create_engine(engine_string, echo=echo)
Session = _sessionmaker(bind=engine)
return Session(), engine
[docs]def initialize_database(study_file, grasp_file, commit_every=250000,
progress=False):
"""Create the database quickly.
:study_file: Tab delimited GRASP study file, available here:
`<github.com/MikeDacre/grasp/blob/master/grasp_studies.txt>`_
:grasp_file: Tab delimited GRASP file.
:commit_every: How many rows to go through before commiting to disk.
:progress: Display a progress bar (db length hard coded).
"""
rows = 0
count = commit_every
pphenos = {}
phenos = {}
platforms = {}
populations = {}
# Create tables
_, engine = get_session()
print('Dropping and creating database tables, this may take a while if',
'the old database is large.')
if _config['DEFAULT']['DatabaseType'] == 'sqlite':
cfile = _os.path.isfile(_config['sqlite']['DatabaseFile'])
if _os.path.isfile(cfile):
_os.remove(cfile)
_Base.metadata.drop_all(engine)
_Base.metadata.create_all(engine)
print('Tables created.')
conn = engine.connect()
# Get tables
study_table = _Study.__table__
snp_table = _SNP.__table__
pheno_table = _Phenotype.__table__
pcat_table = _PhenoCats.__table__
plat_table = _Platform.__table__
pop_table = _Population.__table__
# Create insert statements
study_ins = study_table.insert()
snp_ins = snp_table.insert()
pheno_ins = pheno_table.insert()
pcat_ins = pcat_table.insert()
plat_ins = plat_table.insert()
pop_ins = pop_table.insert()
phsnp_ins = _snp_pheno_assoc.insert()
phstudy_ins = _study_pheno_assoc.insert()
plstudy_ins = _study_plat_assoc.insert()
# Unique ID counters
spare_id = 1
pheno_id = 1
pcat_id = 1
plat_id = 1
pop_id = 1
# Lists to hold records
pheno_records = []
pcat_records = []
plat_records = []
pop_records = []
study_records = []
snp_records = []
phsnp_records = []
phstudy_records = []
plstudy_records = []
# Platform parsing regex
plat_parser = _recompile(r'^([^[]*)\[([^]]+)\]?(.*)')
# Build study information from study file
print('Parsing study information.')
with _open_zipped(study_file) as fin:
# Drop header
fin.readline()
if progress:
pbar = _tqdm(total=2083, unit='studies')
for line in fin:
f = line.rstrip().split('\t')
# Get primary phenotype
ppheno = _cleanstr(f[7].strip())
if ppheno not in pphenos:
pheno_records.append({'phenotype': ppheno,
'id': pheno_id})
pphenos[ppheno] = pheno_id
pheno_id += 1
# Get phenotype categories
pheno_cats = f[8].strip().split(';')
our_phenos = []
for pcat in pheno_cats:
pcat = pcat.strip()
if not pcat:
continue
if pcat not in phenos:
pcat_records.append({
'id': pcat_id,
'category': pcat,
'alias': pheno_synonyms[pcat],
})
phenos[pcat] = pcat_id
pcat_id += 1
our_phenos.append(phenos[pcat])
# Get platform info
our_platforms = []
try:
plat, snp_count, impt = [
i.strip() for i in plat_parser.findall(f[18].strip())[0]
]
imputed = True if impt == '(imputed)' else False
plats = _split_mesy_list(plat)
for plat in plats:
plat = plat.strip()
if plat not in platforms:
plat_records.append({'id': plat_id,
'platform': plat})
platforms[plat] = plat_id
plat_id += 1
our_platforms.append(platforms[plat])
except IndexError:
plat, snp_count, impt = None, None, None
imputed = None
# Get population description
try:
pop = f[19].strip()
try:
pop = pop_correction[pop]
except KeyError:
pass
if pop not in populations:
pop_records.append({'id': pop_id,
'population': pop})
populations[pop] = pop_id
pop_id += 1
population = populations[pop]
except IndexError:
population = None
# Set populaion flags
pflag = _PopFlag
disc_pop = pflag(0)
rep_pop = pflag(0)
l = len(f)
if l > 22 and f[22]:
disc_pop |= pflag.eur
if l > 23 and f[23]:
disc_pop |= pflag.afr
if l > 24 and f[24]:
disc_pop |= pflag.east_asian
if l > 25 and f[25]:
disc_pop |= pflag.south_asian
if l > 26 and f[26]:
disc_pop |= pflag.his
if l > 27 and f[27]:
disc_pop |= pflag.native
if l > 28 and f[28]:
disc_pop |= pflag.micro
if l > 29 and f[29]:
disc_pop |= pflag.arab
if l > 30 and f[30]:
disc_pop |= pflag.mix
if l > 31 and f[31]:
disc_pop |= pflag.uns
if l > 32 and f[32]:
disc_pop |= pflag.filipino
if l > 33 and f[33]:
disc_pop |= pflag.indonesian
if l > 35 and f[35]:
rep_pop |= pflag.eur
if l > 36 and f[36]:
rep_pop |= pflag.afr
if l > 37 and f[37]:
rep_pop |= pflag.east_asian
if l > 38 and f[38]:
rep_pop |= pflag.south_asian
if l > 39 and f[39]:
rep_pop |= pflag.his
if l > 40 and f[40]:
rep_pop |= pflag.native
if l > 41 and f[41]:
rep_pop |= pflag.micro
if l > 42 and f[42]:
rep_pop |= pflag.arab
if l > 43 and f[43]:
rep_pop |= pflag.mix
if l > 44 and f[44]:
rep_pop |= pflag.uns
if l > 45 and f[45]:
rep_pop |= pflag.filipino
if l > 46 and f[46]:
rep_pop |= pflag.indonesian
# Set the global population flag
pop_flag = disc_pop | rep_pop
# Create study
study_records.append({
'id': int(f[0]),
'author': _cleanstr(f[1]),
'pmid': _cleanstr(f[2]),
'grasp_ver': 1 if '1.0' in f[3] else 2,
'noresults': True if f[4] else False,
'results': int(f[5]),
'qtl': True if f[6] == '1' else False,
'phenotype_id': pphenos[ppheno],
'phenotype_desc': ppheno,
'phenotype': pphenos[ppheno],
'phenotype_cats': our_phenos,
'datepub': _get_date(f[9]),
'in_nhgri': _get_bool(f[10]),
'journal': _cleanstr(f[11]),
'title': _cleanstr(f[12]),
'locations': _cleanstr(f[13]),
'mf': _get_bool(f[14]),
'mf_only': _get_bool(f[15]),
'sample_size': _cleanstr(f[16]),
'replication_size': _cleanstr(f[17]),
'platforms': platforms,
'snp_count': snp_count,
'imputed': imputed,
'population_id': population,
'population': population,
'total': int(f[20]),
'total_disc': int(f[21]),
'pop_flag': int(pop_flag),
'disc_pop_flag': int(disc_pop),
'european': int(f[22]) if l > 22 and f[22] else None,
'african': int(f[23]) if l > 23 and f[23] else None,
'east_asian': int(f[24]) if l > 24 and f[24] else None,
'south_asian': int(f[25]) if l > 25 and f[25] else None,
'hispanic': int(f[26]) if l > 26 and f[26] else None,
'native': int(f[27]) if l > 27 and f[27] else None,
'micronesian': int(f[28]) if l > 28 and f[28] else None,
'arab': int(f[29]) if l > 29 and f[29] else None,
'mixed': int(f[30]) if l > 30 and f[30] else None,
'unspecified': int(f[31]) if l > 31 and f[31] else None,
'filipino': int(f[32]) if l > 32 and f[32] else None,
'indonesian': int(f[33]) if l > 33 and f[33] else None,
'total_rep': int(f[34]) if l > 34 and f[34] else None,
'rep_pop_flag': int(rep_pop),
'rep_european': int(f[35]) if l > 35 and f[35] else None,
'rep_african': int(f[36]) if l > 36 and f[36] else None,
'rep_east_asian': int(f[37]) if l > 37 and f[37] else None,
'rep_south_asian': int(f[38]) if l > 38 and f[38] else None,
'rep_hispanic': int(f[39]) if l > 39 and f[39] else None,
'rep_native': int(f[40]) if l > 40 and f[40] else None,
'rep_micronesian': int(f[41]) if l > 41 and f[41] else None,
'rep_arab': int(f[42]) if l > 42 and f[42] else None,
'rep_mixed': int(f[43]) if l > 43 and f[43] else None,
'rep_unspecified': int(f[44]) if l > 44 and f[44] else None,
'rep_filipino': int(f[45]) if l > 45 and f[45] else None,
'rep_indonesian': int(f[46]) if l > 46 and f[46] else None,
})
# Create association records
for i in our_phenos:
phstudy_records.append({'study_id': int(f[0]),
'pheno_id': i})
for i in our_platforms:
plstudy_records.append({'study_id': int(f[0]),
'platform_id': i})
pbar.update()
pbar.close()
print('Writing study information...')
conn.execute(pheno_ins, pheno_records)
conn.execute(pcat_ins, pcat_records)
conn.execute(plat_ins, plat_records)
conn.execute(pop_ins, pop_records)
conn.execute(study_ins, study_records)
conn.execute(phstudy_ins, phstudy_records)
conn.execute(plstudy_ins, plstudy_records)
print('Done')
# Reinitialize lists for main GRASP parser
pheno_records = []
pcat_records = []
plat_records = []
pop_records = []
# Get full study info from database for use in SNPs
sinfo = conn.execute(_select([study_table.c.id, study_table.c.pmid])).fetchall()
studies = {}
for i, p in sinfo:
studies[p] = i
no_pmid = {
'Dissertation (https://openaccess.leidenuniv.nl/handle/1887/17746)': 1,
'KARE Genomewide Association Study of Blood Pressure Using Imputed SNPs': 2,
'Genome-wide Association Study Identification of a New Genetic Locus with Susceptibility to Osteoporotic Fracture in the Korean Population.': 3,
'Genome-wide Association Study Identified TIMP2 Genetic Variant with Susceptibility to Osteoarthritis': 4,
'Application of Structural Equation Models to Genome-wide Association Analysis ': 5,
'Comparison of Erythrocyte Traits Among European, Japanese and Korean': 6,
'Genomewide Association Study Identification of a New Genetic Locus with Susceptibility to Osteoporotic Fracture in the Korean Population': 7,
'Joint identification of multiple genetic variants of obesity in A Korean Genome-wide association study': 8,
'Genome-Wide Association Analyses on Blood Pressure Using Three Different Phenotype Definitions': 9,
'Association of intronic sequence variant in the gene encoding spleen tyrosine kinase with susceptibility to vascular dementia': 10,
}
print('Parsing SNP information...')
with _open_zipped(grasp_file, encoding='latin1') as fin:
# Drop header
fin.readline()
if progress:
pbar = _tqdm(total=8864717, unit='snps')
for line in fin:
f = line.rstrip().split('\t')
# Get primary phenotype
ppheno = _cleanstr(f[11])
# These are poorly curated, so there is no need to use a
# separate table for them.
# if ppheno not in pphenos:
# conn.execute(pheno_ins.values(
# phenotype=ppheno
# ))
# pphenos[ppheno] = conn.execute(
# select([pheno_table.c.id]).where(
# pheno_table.c.phenotype == ppheno
# )
# ).first()[0]
# Get phenotype categories
pheno_cats = f[13].strip().split(';')
our_phenos = []
for pcat in pheno_cats:
pcat = pcat.strip()
if not pcat:
continue
if pcat not in phenos:
pcat_records.append({
'id': pcat_id,
'category': pcat,
'alias': pheno_synonyms[pcat],
})
phenos[pcat] = pcat_id
pcat_id += 1
our_phenos.append(phenos[pcat])
# Get population description
try:
pop = f[23].strip()
try:
pop = pop_correction[pop]
except KeyError:
pass
if pop not in populations:
pop_records.append({'id': pop_id,
'population': pop})
populations[pop] = pop_id
pop_id += 1
population = populations[pop]
except IndexError:
population = None
# Create record for SNP
try:
sid = int(f[0])
except ValueError:
sid = spare_id
spare_id += 1
l = len(f)
try:
study = studies[f[7].strip()]
except KeyError:
study = no_pmid[f[17].strip()]
record = {
'id': sid,
'NHLBIkey': f[0],
'HUPfield': f[1],
'LastCurationDate': _get_date(f[2]),
'CreationDate': _get_date(f[3]),
'snpid': f[4],
'chrom': f[5],
'pos': int(f[6]),
'population_id': population,
'population': population,
'study_id': study,
'study': study,
'study_snpid': f[8],
'paper_loc': f[9],
'pval': float(f[10]) if f[10] else None,
'phenotype_desc': ppheno,
'phenotype_cats': our_phenos,
}
record['InGene'] = f[51] if l > 52 else None
record['NearestGene'] = f[52] if l > 53 else None
record['InLincRNA'] = f[53] if l > 54 else None
record['InMiRNA'] = f[54] if l > 55 else None
record['InMiRNABS'] = f[55] if l > 56 else None
record['dbSNPfxn'] = f[56] if l > 57 else None
record['dbSNPMAF'] = f[57] if l > 58 else None
record['dbSNPinfo'] = f[58] if l > 59 else None
record['dbSNPvalidation'] = f[59] if l > 60 else None
record['dbSNPClinStatus'] = f[60] if l > 61 else None
record['ORegAnno'] = f[61] if l > 62 else None
record['ConservPredTFBS'] = f[62] if l > 63 else None
record['HumanEnhancer'] = f[63] if l > 64 else None
record['RNAedit'] = f[64] if l > 65 else None
record['PolyPhen2'] = f[65] if l > 66 else None
record['SIFT'] = f[66] if l > 67 else None
record['LSSNP'] = f[67] if l > 68 else None
record['UniProt'] = f[68] if l > 69 else None
record['EqtlMethMetabStudy'] = f[69] if l > 70 else None
snp_records.append(record)
# Create association records
for i in our_phenos:
phsnp_records.append({'snp_id' : sid, 'pheno_id' : i})
# Decide when to execute
if count:
count -= 1
else:
if progress:
pbar.write('Writing rows...')
else:
print('Writing rows...')
if pcat_records:
conn.execute(pcat_ins, pcat_records)
if plat_records:
conn.execute(plat_ins, plat_records)
if pop_records:
conn.execute(pop_ins, pop_records)
conn.execute(snp_ins, snp_records)
conn.execute(phsnp_ins, phsnp_records)
if progress:
pbar.write('{} rows written'.format(rows))
else:
print('{} rows written'.format(rows))
count = commit_every-1
pcat_records = []
plat_records = []
pop_records = []
snp_records = []
phsnp_records = []
rows += 1
if progress:
pbar.update()
# Final insert
pbar.close()
print('Writing final rows...')
conn.execute(snp_ins, snp_records)
conn.execute(phsnp_ins, phsnp_records)
print('{} rows written'.format(rows))
print('Done!')
###############################################################################
# Support Functions #
###############################################################################
_rmspace = _recompile(r' +')
def _cleanstr(string):
"""Run a few regex cleanups on a string.
Strips unneeded starting characters, trims extra spaces, and converts
mangeled unicode characters from Excel into readable characters.
"""
cstr = _rmspace.sub(' ', string.strip(" \"'"))
return _unescape(cstr)
def _split_mesy_list(string):
"""Split a string that contains commas and 'ands/&'
:string: A string with commas/ands/&
:returns: A list
"""
init_list = [i.strip() for i in string.split(',') if i]
final_list = []
for i in init_list:
if i.isspace():
continue
andlist = i.split('and')
amplist = i.split('&')
if len(andlist) > 1:
for j in andlist:
if not j or j.isspace():
continue
final_list.append(j.strip())
elif len(amplist) > 1:
for j in amplist:
if not j or j.isspace():
continue
final_list.append(j.strip())
else:
final_list.append(i.strip())
final_list = [i.strip() for i in final_list if not i.isspace()]
return [i for i in final_list if i]
def _get_date(string):
"""Return datetime date object from string."""
try:
return _date.fromordinal(_dateparse(string).toordinal())
except ValueError:
print(string)
raise
def _get_bool(string):
"""Return a bool from a string of y/n."""
string = string.lower()
return True if string == 'y' else False
def _open_zipped(infile, mode='r', encoding='utf-8'):
""" Return file handle of file regardless of zipped or not
Text mode enforced for compatibility with python2 """
mode = mode[0] + 't'
p2mode = mode
if hasattr(infile, 'write'):
return infile
if isinstance(infile, str):
if infile.endswith('.gz'):
return _zopen(infile, mode)
if infile.endswith('.bz2'):
return _bopen(infile, mode)
return open(infile, p2mode, encoding=encoding)