Module `omniplot.chipseq`

Expand source code

from typing import List,Dict,Optional,Union,Any,Iterable
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os 
import scipy.stats
from scipy.spatial.distance import squareform, pdist
from scipy.spatial import distance
from joblib import Parallel, delayed
import itertools as it
from joblib.externals.loky import get_reusable_executor
from omniplot.chipseq_utils import (_stitching_for_pyrange, 
                                    gff_parser, 
                                    _read_peaks, 
                                    _read_bw, 
                                    _calc_pearson,
                                    _stitching,
                                    _read_tss,
                                    _remove_close_to_tss,
                                    _find_extremes,
                                    _readgff,
                                    _readgff2,
                                    _readgtf,
                                    _readgtf2,
                                    _read_and_reshape_bw,
                                    _read_bw_stats,
                                    _readgff_transcripts,
                                    _read_transcripts)
sns.set_theme(font="Arial", style={'grid.linestyle': "",'axes.facecolor': 'whitesmoke'})
import random
import string
from matplotlib.colors import LogNorm
from sklearn.decomposition import PCA, NMF, LatentDirichletAllocation
from scipy.stats import fisher_exact
from scipy.stats import zscore
from sklearn.cluster import KMeans
from omniplot.utils import _optimal_kmeans, _save
import time
import pyranges as pr
import warnings
import sys

try:
    import pyBigWig as pwg
except ImportError as e:
    print("If you want to use chipseq functions. You need to install pyBigWig.")
except Exception as err:
    print(f"Unexpected {err=}, {type(err)=}")
    raise

def plot_bigwig(files: dict, 
                bed: Union[str, list], 
                gff: str,
                step: int=100,
                stack_regions: str="horizontal", 
                highlight: Union[str, list]=[],
                highlightname="",
                n_jobs=-1,
                save: str="")->dict:
    """
    Plotting bigwig files in specified genomic regions in the bed file.  
    
    Parameters
    ----------
    files : dict
        A dictionary whose keys are sample names and values are file names 
    bed : str
        A bed file name containing the list of genomic regions
    gff : str
        A gff3 file with a corresponding genome version
    step: int
        A bin size to reduce the bigwig signal resolution.
    show : bool
        Whether or not to show the figure.
    Returns
    -------
    {"ax":axes,"values":mat,"genes":geneset,"positions":pos} : dict
        A dictionary containing matplot axes, signal values, geneset, and positions.
    Raises
    ------
    Notes
    -----
    References
    ----------
    See Also
    --------
    Examples
    --------
    """ 
    for k, v in files.items():
        assert type(v)==str, "filenames must be str, but {} was give.".format(type(v))
        if not os.path.isfile(v)==True:
            raise Exception("{} does not exist.".format(v))
    if type(bed)==str:
        posrange=[]
        pos=[]
        with open(bed) as fin:
            
            for l in fin:
                l=l.split()
                chrom, s, e=l[0],l[1],l[2]
                posrange.append(pr.from_dict({"Chromosome": [chrom], "Start": [int(s.replace(",",""))], "End": [int(e.replace(",",""))]}))
                pos.append([chrom,int(s.replace(",","")),int(e.replace(",",""))])  
    else:
        pos=bed
        
    if type(highlight)==str:
        _highlight=[]
        with open(bed) as fin:
            for l in fin:
                l=l.split()
                chrom, s, e=l[0],l[1],l[2]
                _highlight.append([chrom,int(s.replace(",","")),int(e.replace(",",""))])
        highlight=_highlight
    
    time_start=time.time()
    if gff.endswith("gff") or gff.endswith("gff3"):
        gff_dict=_readgff2(gff, "gene", others=["gene_name", "Strand"])
    elif gff.endswith("gtf"):
        gff_dict=_readgtf2(gff, "transcript", others=["gene_name", "Strand"])
    else:
        raise Exception("The gff/gtf file name is required to have either gff, gff3, or gtf extension.")
    gff_ob=pr.from_dict(gff_dict)
    
    #
    # if gff.endswith("gff") or gff.endswith("gff3"):
    #     gff_ob=pr.read_gff3(gff)
    #     gff_ob=gff_ob[gff_ob.Feature=="gene"]
    #     gffformat="gff"
    # elif gff.endswith("gtf"):
    #     gff_ob=pr.read_gtf(gff)
    #     gff_ob=gff_ob[gff_ob.Feature=="transcript"]
    #     gffformat="gtf"
    print("Reading the gene annotation file took", time.time()-time_start)
    # geneset=[]
    # for chrom, start, end in pos:
    #     geneset.append(gff_ob.get_genes(chrom, start, end))
    time_start=time.time()
    geneset=[]
    _grs=Parallel(n_jobs=n_jobs)(delayed(gff_ob.overlap)(gr) for gr in posrange)
    # for gr in posrange:
    #     gr2=gff_ob.intersect(gr)
    for gr2 in _grs:
        #gr2=gff_ob.intersect(gr)
        if len(gr2)==0:
            geneset.append([])
        else:
            #print(gr2)
            # if gffformat=="gff":
            #     gr2=gr2[gr2.Feature=="gene"]
            # elif gffformat=="gtf":
            #     gr2=gr2[gr2.Feature=="transcript"]
                
            tmp=[]
            for gene_name, start, end, ori in zip(gr2.gene_name, gr2.Start, gr2.End, gr2.Strand):
                tmp.append([gene_name, start, end, ori])
            geneset.append(tmp)
    #print(geneset)
    #geneset=Parallel(n_jobs=-1)(delayed(gff_ob.get_genes)(chrom, start, end) for chrom, start, end in pos)
    print("Obtaining genes took", time.time()-time_start)
            
    time_start=time.time()
    mat={}
    for samplename, f in files.items():
        bw=pwg.open(f)
        #h, t=os.path.split(f)
        #samplename=t.replace(prefix, "").replace(suffix_set, "")
        mat[samplename]=[]
        
        for chrom, s, e in pos:
            val=bw.values(chrom, s, e)
            
            mat[samplename].append(val)
    
    
    
    print("Reading bigwig files took", time.time()-time_start)
    
    if stack_regions=="vertical":
        fig, axes=plt.subplots(nrows=len(pos)*2,ncols=len(files),figsize=[10,10],gridspec_kw={
                           'height_ratios': [2,1]*len(pos),"hspace":0},
                       constrained_layout = True)
        if len(axes.shape)==1:
            axes=axes.reshape([-1,1])
        sample_order=sorted(mat.keys())
        ticks_labels=[]
        for si, sample in enumerate(sample_order):
            
            vals=mat[sample]
            ymax=0
            for posi, (genes, val) in enumerate(zip(geneset, vals)):
                if len(val)==0:
                    continue
                ax=axes[posi*2,si]
                chrom, s, e=pos[posi]
                val=np.array(val)
                vallen=val.shape[0]
                remains=vallen%step
                _e=e-remains
                x=np.arange(s, _e,step)
                #print(s, e, _e, vallen, step)
                val=val[:vallen-remains].reshape(-1, step).mean(axis=1)
                _ymax=np.amax(val)
                if _ymax >  ymax:
                    ymax =_ymax
                if len(highlight)!=0:
                    hchrom, hs, he=highlight[posi]
                    ax.fill_between([hs, he], [_ymax, _ymax], alpha=0.5, color="r")
                ax.fill_between(x, val,edgecolor='face')
                
                
                if si==0:
                    ax.set_title(sample+" "+chrom+":"+str(s)+"-"+str(e))
                    ticks_labels.append([[np.amin(x),np.amax(x)],ax.get_xticks(), ax.get_xticklabels()])
                ax.set_xticks([],labels=[])
            occupied=[[] for i in range(8)]
        
        

        
        for posi in range(len(pos)):
            interval=pos[posi][2]-pos[posi][1]
            axes[posi*2+1,si].plot(ticks_labels[posi][0],[0,0],alpha=0)
            for genename, gs, ge,ori in geneset[posi]:
                slot=0
                for oi in range(len(occupied)):
                    oc=False
                    if len(occupied[oi])==0:
                        occupied[oi].append([gs, ge])
                        oc=False
                        slot=oi
                    else:
                        for tmps, tmpe in occupied[oi]:
                            if tmps<=gs<= tmpe or tmps<=ge<=tmpe:
                                oc=True
                        if oc==False:
                            occupied[oi].append([gs, ge])
                            slot=oi
                    if oc==False:
                        break
                if gs < pos[posi][1]:
                    gs=pos[posi][1]
                if ge > pos[posi][2]:
                    ge=pos[posi][2]
                axes[posi*2+1,si].plot([gs, ge], [slot*1,slot*1], color="gray")
                if ori=="+":
                    axes[posi*2+1,si].plot([ge-interval/32, ge], [slot*1+1,slot*1], color="gray")
                elif ori=="-":
                    axes[posi*2+1,si].plot([gs, gs+interval/32], [slot*1,slot*1+1], color="gray")
                
                axes[posi*2+1,si].text((gs+ge)/2, slot*1, genename, ha="center")
            for oi, ol in enumerate(occupied):
                if len(ol)==0:
                    maxslot=oi
                    break
            axes[posi*2+1,si].set_ylim(-0.5,maxslot+0.5)
            #axes[posi*2+1,si].ticklabel_format(useOffset=False)
            axes[posi*2+1,si].set_yticks([],labels=[])
            axes[posi*2+1,si].set_xticks(ticks_labels[posi][1])
            axes[posi*2+1,si].set_ylabel("Genes")
        if save !="":
            if save.endswith(".pdf") or save.endswith(".png") or save.endswith(".svg"):
                h, t=os.path.splitext(save)
                plt.savefig(h+"_plot"+t)
            else:
                plt.savefig(save+"_plot.pdf")
    else:
        fig, axes=plt.subplots(nrows=len(files)+1,ncols=len(pos),gridspec_kw={
                           'height_ratios': [2]*len(files)+[1]})
        
        if len(axes.shape)==1:
            axes=axes.reshape([-1,1])
        sample_order=sorted(mat.keys())
        for si, sample in enumerate(sample_order):
            vals=mat[sample]
            ymax=0
            #print(ymax)
            for posi, (genes, val) in enumerate(zip(geneset, vals)):
                if len(val)==0:
                    continue
                ax=axes[si,posi]
                chrom, s, e=pos[posi]
                val=np.array(val)
                vallen=val.shape[0]
                remains=vallen%step
                _e=e-remains
                x=np.arange(s, _e,step)
                
                #print(s, e, _e, vallen, step)
                val=val[:vallen-remains].reshape(-1, step).mean(axis=1)
                _ymax=np.amax(val)
                if _ymax >  ymax:
                    ymax =_ymax

                ax.fill_between(x, val,edgecolor='face')

                if posi==0:
                    ax.set_ylabel(sample)
    
                ax.set_xticks([],labels=[])
                if si==0:
                    ax.set_title(chrom+":"+str(s)+"-"+str(e))
                ax.grid(False)
                    
            occupied=[[] for i in range(8)]
            for posi in range(len(pos)):
                axes[si,posi].set_ylim(0,ymax*1.01)
                if posi >0:
                    axes[si,posi].set_yticks([],labels=[])
        
        for posi in range(len(pos)):
            interval=pos[posi][2]-pos[posi][1]
            for genename, gs, ge,ori in geneset[posi]:
                slot=0
                for oi in range(len(occupied)):
                    oc=False
                    if len(occupied[oi])==0:
                        occupied[oi].append([gs, ge])
                        oc=False
                        slot=oi
                    else:
                        for tmps, tmpe in occupied[oi]:
                            if tmps<=gs<= tmpe or tmps<=ge<=tmpe:
                                oc=True
                        if oc==False:
                            occupied[oi].append([gs, ge])
                            slot=oi
                    if oc==False:
                        break
                if gs < pos[posi][1]:
                    gs=pos[posi][1]
                if ge > pos[posi][2]:
                    ge=pos[posi][2]
                axes[si+1,posi].plot([gs, ge], [slot*1,slot*1], color="gray")
                if ori=="+":
                    axes[si+1,posi].plot([ge-interval/32, ge], [slot*1+0.1,slot*1], color="gray")
                elif ori=="-":
                    axes[si+1,posi].plot([gs, gs+interval/32], [slot*1,slot*1+0.1], color="gray")
                
                axes[si+1,posi].text((gs+ge)/2, slot*1, genename, ha="center")
                #axes[si+1,posi].grid(False)
            for oi, ol in enumerate(occupied):
                if len(ol)==0:
                    maxslot=oi
                    break
            axes[si+1,posi].set_ylim(-0.5,maxslot+0.5)
            axes[si+1,posi].ticklabel_format(useOffset=False)
            axes[si+1,posi].set_yticks([],labels=[])
            
        
        plt.tight_layout(h_pad=-1,w_pad=-1)
        _save(save, "plot")
        
    return {"ax":axes,"values":mat,"genes":geneset,"positions":pos}


def plot_bigwig_correlation(files: dict, 
                            chrom: str="chr1",
                            step: int=1000,
                            palette: str="coolwarm",
                            figsize: list=[6,6],
                            show_val: bool=True,
                            clustermap_param: dict={},
                            peakfile: str="",
                            #show_val: bool=False,
                            show: bool=False,
                            n_jobs: int=-1, save: str="") -> Dict:
    
    """
    Calculate pearson correlations and draw a heatmap for bigwig files.  
    
    Parameters
    ----------
    files : dict
        A dictionary whose keys are sample names and values are bigwig file names 
    chrom : str, optional
        A chromsome to cal
    step: int, optional
        A bin size to reduce the bigwig signal resolution.
    palette: str, optional
    show_val: bool, optional
        Whether or not to show correlation values on the heatmap
    peakfile: str, optional
        A peak file/bed file containing genome regions of interest. If given this option, the correlation will be calculated using the genome regions listed in this file.
    show : bool, optional
        Whether or not to show the figure.
    n_jobs : int, optional
        The number of threads to use. It will use all available threads by default. 
    Returns
    -------
    {"ax":axes,"values":mat,"genes":geneset,"positions":pos} : dict
        A dictionary containing matplot axes, signal values, geneset, and positions.
    Raises
    ------
    Notes
    -----
    References
    ----------
    See Also
    --------
    Examples
    --------
    """
    for k, v in files.items():
        assert type(v)==str, "filenames must be str, but {} was give.".format(type(v))
        if not os.path.isfile(v)==True:
            raise Exception("{} does not exist.".format(v))
    
    mat=[]
    samples=[]
    bw=pwg.open(list(files.values())[0])
    chrom_sizes=bw.chroms()
    
    
        
    
    if peakfile!="":
        # peaks=read_peaks(peakfile)
        # for k, f in files.items():
        #     bw=pwg.open(f)
        #     tmp=[]
        #     for chrom, se in peaks.items():
        #         for s, e in se:
        #             val=bw.stats(chrom,s,e, exact=True)[0]
        #             if val==None:
        #                 val=0
        #         tmp.append(val)
        #     mat.append(tmp)
        #     samples.append(k)
        peaks=_read_peaks(peakfile)
        for k, f in files.items():
            bw=pwg.open(f)
            
            tmp=Parallel(n_jobs=n_jobs)(delayed(_read_bw_stats)(chrom, s, e, f) for chrom, s, e in peaks)
            mat.append(tmp)
            samples.append(k)
        
    elif chrom !="all":
        chrom_size=chrom_sizes[chrom]
        
        samples=[]
        mat=[]
        #samples=list(files.keys())
        #mat=Parallel(n_jobs=n_jobs)(delayed(read_bw)(files[s],chrom, chrom_size, step) for s in samples)
        for k, f in files.items():
            bw=pwg.open(f)
            val=np.array(bw.values(chrom,0,chrom_size))
            rem=chrom_size%step
            val=val[:val.shape[0]-rem]
            print(val.shape)
            val=np.nan_to_num(val)
            val=val.reshape([-1,step]).mean(axis=1)
            print(val.shape)
            mat.append(val/np.sum(val))
            samples.append(k)
    elif chrom=="all":
        chroms=chrom_sizes.keys()
        for k, f in files.items():
            
            #tmp=Parallel(n_jobs=n_jobs)(delayed(read_bw)(f, chrom, 0, chrom_sizes[chrom], f) for chrom, s, e in chroms)
            bw=pwg.open(f)
            tmp=[]
            for chrom in chroms:
                chrom_size=chrom_sizes[chrom]
                val=np.array(bw.values(chrom,0,chrom_size))
                rem=chrom_size%step
                val=val[:val.shape[0]-rem]
                val=np.nan_to_num(val)
                val=val.reshape([-1,step]).mean(axis=1)
                tmp.append(val/np.sum(val))
            
            samples.append(k)
            mat.append(np.concatenate(tmp))
    else:
        raise Exception("please specify chrom or peakfile options.")
    
    mat=np.array(mat)
    dmat=Parallel(n_jobs=-1)(delayed(_calc_pearson)(ind, mat) for ind in list(it.combinations(range(mat.shape[0]), 2)))
    dmat=np.array(dmat)
    get_reusable_executor().shutdown(wait=True)
    dmat=squareform(dmat)
    print(dmat)
    dmat+=np.identity(dmat.shape[0])
    g=sns.clustermap(data=dmat,xticklabels=samples,yticklabels=samples,
               method="ward", cmap=palette,
               col_cluster=True,
               row_cluster=True,
               figsize=figsize,
               rasterized=True,
               #cbar_kws={"label":"Pearson correlation"}, 
               annot=show_val,
               **clustermap_param)
    g.cax.set_ylabel("Pearson correlation", rotation=-90,va="bottom")
    plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)  # For y axis
    plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90) # For x axis
    
    
    _save(save, "correlation")
    if show:
        plt.show()
    return {"correlation": dmat,"samples":samples}
    
def call_superenhancer(bigwig: str, 
                       bed: str,
                       stitch=25000,
                       tss_dist: int=0,
                       plot_signals=False,
                       gff: str="",
                       closest_genes: bool=False,
                       nearest_k: int=1,
                       go_analysis: bool=False,
                       gonames: list=["GO_Biological_Process_2021","Reactome_2022","WikiPathways_2019_Human"],
                       sample: str="Sample",
                       n_jobs: int=12,
                       save: str="")->Dict:
    """
    Find super enhancers and plot an enhancer rank.  
    
    Parameters
    ----------
    bigwig : str
        A bigwig file.
    peakfile : str
        A peak file.
    tss_dist: int, optional
        A distance from transcriptional start sites. If this option is more than 0, you need provide gff option too.
    stitch: int, optional
        The maximum distance of enhancers to stitch together.
    plot_signals : bool, optional
        Drawing super enhancer regions +- 10kb
    gff: str, optional (required when tss_dist is provided)
        A gff or gtf file that contains gene annotations. You need to chose a correct genome version of a gff/gtf file 
    
    closest_genes: bool=False,
        If this is True, closest genes of SE will be shown, instead of genomic regions. You can specify the number 
        of closest genes by "nearest_k" option.
    nearest_k: int, optional
        The number of closest genes to be included. Default: 1.
        
    go_analysis: bool, optional
        GSEA analysis based on the rank of SEs.
        
    gonames: list=["GO_Biological_Process_2021","Reactome_2022","WikiPathways_2019_Human"]
        Gene sets for GSEA analysis.
    
    sample: str,optional
        The sample name to appear in the signal plot
    
    Returns
    -------
        dict {"ax": ax, "positions":pos, "signals":y,"nearest_genes":_nearestgenes}
    Raises
    ------
    Notes
    -----
    References
    ----------
    See Also
    --------
    Examples
    --------
    """ 
    
    bw=pwg.open(bigwig)
    chrom_sizes=bw.chroms()
    peaks=_read_peaks(bed)
    #stitched=stitching(peaks, stitch)
    
    
    if tss_dist != 0:
        start_time=time.time()
        stitched=pr.from_dict(_stitching_for_pyrange(peaks, stitch))
        
        if gff.endswith("gff") or gff.endswith("gff3"):
            gff_dict=_readgff(gff, "transcript",tss_dist)
        elif gff.endswith("gtf"):
            gff_dict=_readgtf(gff, "transcript",tss_dist)
        else:
            raise Exception("The gff/gtf file name is required to have either gff, gff3, or gtf extension.")
        gffr=pr.from_dict(gff_dict)
        
        _stitched=stitched.subtract(gffr, nb_cpu=n_jobs)
        stitched={}
        for chrom, s, e in zip(_stitched.Chromosome, _stitched.Start, _stitched.End):
            if not chrom in stitched:
                stitched[chrom]=[]
            stitched[chrom].append([s,e])
        print("removing enhancers around TSS took ", time.time()-start_time)
        # tss_pos=read_tss(tss,tss_dist)
        # stitched=remove_close_to_tss(stitched, tss_pos)
    else:
        stitched=_stitching(peaks, stitch)
        
        
    """Obtaining closest genes"""
    _nearestgenes=[]
    if closest_genes==True:
        start_time=time.time()
        if gff.endswith("gff") or gff.endswith("gff3"):
            gff_dict=_readgff2(gff, "gene", others=["gene_name"])
        elif gff.endswith("gtf"):
            gff_dict=_readgtf2(gff, "transcript", others=["gene_name"])
        else:
            raise Exception("The gff/gtf file name is required to have either gff, gff3, or gtf extension.")
        gffr=pr.from_dict(gff_dict)
        # grlist=[]
        # for chrom_se in pos:
        #     chrom, se=chrom_se.split(":")
        #     s, e=se.split("-")
        #     grlist.append(pr.from_dict({"Chromosome":[chrom],"Start":[int(s)],"End":[int(e)]}))
        # if nearest_k==1:
        #     _nearestgenes=Parallel(n_jobs=-1)(delayed(gr.nearest)(gffr) for gr in grlist)
        # else:
        #     _nearestgenes=Parallel(n_jobs=-1)(delayed(gr.k_nearest)(gffr, k=nearest_k) for gr in grlist)
        grlist={"Chromosome":[],"Start":[],"End":[]}
        stiched_dit={}
        for chrom, se in stitched.items():
            for s, e in se:
                stiched_dit[chrom+":"+str(s)+"-"+str(e)]=[]
                #grlist.append(pr.from_dict({"Chromosome":[chrom],"Start":[int(s)],"End":[int(e)]}))
                grlist["Chromosome"].append(chrom)
                grlist["Start"].append(int(s))
                grlist["End"].append(int(e))
                
        gr=pr.from_dict(grlist)
        _nearestgenes_gr=gr.k_nearest(gffr, k=nearest_k, nb_cpu=n_jobs)
        # _nearestgenes=[]
        # tmp=[]
        for i, (chrom, s, e, gene_name) in enumerate(zip(_nearestgenes_gr.Chromosome, _nearestgenes_gr.Start,_nearestgenes_gr.End,_nearestgenes_gr.gene_name)):
            stiched_dit[chrom+":"+str(s)+"-"+str(e)].append(gene_name)
        #
        #     tmp.append(gene_name)
        #     if i%nearest_k==0 and i>0:
        #         _nearestgenes.append(tmp)
        #         tmp=[]
        # _nearestgenes.append(tmp)
        print("Finding the nearest genes took ", time.time()-start_time)
        
    start_time=time.time()
    mat=[]
    pos=[]
    for chrom, se in stitched.items():
        for s, e in se:
            #print(chrom,s,e)
            val=bw.stats(chrom,s,e, type="sum", exact=True)[0]
            if val==None:
                val=0
            mat.append(val)
            pos.append(chrom+":"+str(s)+"-"+str(e))
            
    x, y, pos, sindex, srt=_find_extremes(mat, pos)
    b=-x[sindex]*np.amax(y) +y[sindex]
    print("Finding super enhancer took ", time.time()-start_time, "sec")
    
    # print(len(_nearestgenes), _nearestgenes[:10])
    # print(srt.shape, srt[:10])
    _nearestgenes=[stiched_dit[chromse] for chromse in pos]
    
        
    fig, ax=plt.subplots(figsize=[6,4])
    ax.scatter(x[:sindex], y[:sindex],s=5,rasterized=True)
    ax.scatter(x[sindex:], y[sindex:], color="r",s=5,rasterized=True)
    ax.plot(x,x*np.amax(y)+b, color="gray")
    
    if closest_genes==True:
        for j in range(5):
            ax.text(x[-j-1],y[-j-1], ",".join(_nearestgenes[-j-1]))
    else:
        for j in range(5):
            ax.text(x[-j-1],y[-j-1], pos[-j-1])
    ax.text(0,y[-1]*0.7, "SE No.: "+str( y[sindex:].shape[0])+"\nCutoff: "+str(np.round(y[sindex],3)))
    ax.set_xlabel("Enhancers")
    ax.set_ylabel("Signal sum")
    ax.set_ylim(0,max(y)*1.05)
    
    tick_=x.shape[0]//3
    if 100<tick_<=1000:
        tick_=500
    elif 1000<tick_<=5000:
        tick_=1000
    elif 5000<tick_<=10000:
        tick_=5000
    else:
        tick_=10000
    ax.set_xticks(np.arange(0,x.shape[0], tick_)/x.shape[0],labels=np.arange(0,x.shape[0], tick_))
    plt.ticklabel_format(axis="y", style="sci", scilimits=(0,0))
    
    _save(save, "rank")
    
    plt.subplots_adjust(right=0.620, bottom=0.130)
    
    
    stitched_out=os.path.splitext(bed)[0]+"_SE.bed"
    with open(stitched_out, "w") as fout:
        
        if closest_genes==True:
            rank=1
            for _pos, _sig, _gr in zip(reversed(pos[sindex:]), reversed(y[sindex:]),reversed(_nearestgenes[sindex:])):
                chrom, se=_pos.split(":")
                s, e=se.split("-")
                fout.write("\t".join([chrom,str(s),str(e), ",".join(_gr), str(_sig),str(rank)+"\n"]))
                rank+=1
        
        else:
            rank=1
            for _pos, _sig in zip(reversed(pos[sindex:]), reversed(y[sindex:])):
                chrom, se=_pos.split(":")
                s, e=se.split("-")
                fout.write("\t".join([chrom,str(s),str(e), str(_sig),str(rank)+"\n"]))
                rank+=1
    print("The SE file was written in "+stitched_out)
    if go_analysis==True:
        print("Now, doing GO anlyses..")
        start_time=time.time()
        try:
            import gseapy
        except ImportError:
            raise ImportError('If you want GO analysis, you need to install gseapy. Try "pip install gseapy".')
        godf={0:[],1:[]}
        y=(y-np.mean(y))/np.std(y)
        for _sig, genes in zip(reversed(y), reversed(_nearestgenes)):
            for gene in list(genes):
                godf[0].append(gene)
                godf[1].append(_sig)
        #print(godf)
        godf=pd.DataFrame(godf)
        godf=godf.set_index(0)
        for name in gonames:
            gs_res = gseapy.prerank(rnk=godf, # or data='./P53_resampling_data.txt'
                         gene_sets=name,
                             threads=4,
                             min_size=5,
                             max_size=1000,
                             permutation_num=100, # reduce number to speed up testing
                             outdir=None, # don't write to disk
                             seed=6,
                             verbose=True,) # see what's going on behind the scenes)
            #print(gs_res.res2d)
            for index, row in gs_res.res2d.iterrows():
                #print(row)
                if row["FDR q-val"] <0.05:
                    t=row.Term
                    gseapy.gseaplot(rank_metric=gs_res.ranking,
                             term=t,
                             cmap=plt.cm.seismic,
                             **gs_res.results[t],
                             ofname=os.path.splitext(bed)[0]+"_"+name+"_gsea.pdf"
                             )
        print("GO analysis took ", time.time()-start_time)
    if plot_signals:
        
        if gff=="":
            raise Exception("Please provide a gff file to draw signals.")
        letters = string.ascii_lowercase
        tmpfile=''.join(random.choice(letters) for i in range(10))+".bed"
        highlights=[]
        with open(tmpfile, "w") as fout:
            rank=1
            for pos, sig in zip(reversed(pos[sindex:]), reversed(y[sindex:])):
                chrom, se=pos.split(":")
                s, e=se.split("-")
                fout.write("\t".join([chrom,str(int(s)-10000),str(int(e)+10000), str(sig),str(rank)+"\n"]))
                highlights.append([chrom,int(s),int(e)])
                rank+=1
                if rank==6:
                    break
                    
        plot_bigwig(files={sample: bigwig}, 
                    bed=tmpfile, gff=gff, 
                    step=100,
                    stack_regions="vertical",
                    highlight=highlights,n_jobs=n_jobs, save=save)
        os.remove(tmpfile)
    return {"ax": ax, "positions":pos, "signals":y,"nearest_genes":_nearestgenes}


def plot_average(files: dict, 
                 bed: Union[str,list], 
                 order: list=[], extend: int=500, 
                 palette: str="coolwarm",
                 binsize: int=10,
                 clustering: str="",
                 n_clusters: int=5,
                 minval: Optional[float]=None,
                 orientaion=False,
                 save="",
                 n_jobs: int=-1):
    """
    Plotting bigwig files centered at peak regions.  
    
    Parameters
    ----------
    files : dict
        A dictionary whose keys are sample names and values are file names 
    bed : str
        A bed file name containing the list of genomic regions
    order : list
        A sample order to plot. if a clustering option (except kmeans_all) is chosen, the first sample is the target of clustering analysis. 
    binsize: int
        A bin size to reduce the bigwig signal resolution.
    clustering : str
        Availables: ["kmeans", "kmeans_all", "kmeans_auto"]
    n_clusters : int
        the number of clusters for clustering analysis. if you chose kmeans_auto, this option will be ignored.
    Returns
    -------
        dict {"values":data,"potisons":pos,"labels":labels}
        values: dictionary containing sample names as the keys and signals as the values
        positions: genomic positions sorted by signals displayed in the plot
        labels: list of clustering labels if kmean clustering was performed
        
    Raises
    ------
    Notes
    -----
    References
    ----------
    See Also
    --------
    Examples
    --------
    """
    if len(order)==0:
        order=list(files.keys())
    for k, v in files.items():
        assert type(v)==str, "filenames must be str, but {} was give.".format(type(v))
        if not os.path.isfile(v)==True:
            raise Exception("{} does not exist.".format(v))
            
    
    
    bw=pwg.open(files[order[0]])
    chrom_sizes=bw.chroms()
    
    if type(bed)==str:
        
        pos=[]
        bedchroms=set()
        with open(bed) as fin:
            for l in fin:
                if l.startswith("#"):
                    continue
                l=l.split()
                chrom, s, e=l[0],l[1],l[2]
                
                if not chrom in chrom_sizes:
                    raise Exception("There may be mismatches in chromosome names between the bed file and bigwig files.\n\
                the chromosome names in the bed file are: {}\n\
                those in the bigwig files are: {}".format(chrom, chrom_sizes.keys())) 
                
                s, e=int(s.replace(",","")),int(e.replace(",",""))
                center=(s+e)//2
                cs=center-extend
                ce=center+extend
                if cs <0 or chrom_sizes[chrom] < ce:
                    continue
                pos.append([chrom,cs,ce])
                bedchroms.add(chrom)
    else:
        pos=bed
    
    
    data={}
    data_mean=[]
    
    for i, sample in enumerate(order):
        bigwig=files[sample]
        data[sample]=[]
        vals, mean=zip(*Parallel(n_jobs=n_jobs, prefer="threads")(delayed(_read_and_reshape_bw)(chrom, s, e, bigwig, binsize) for chrom, s, e in pos))
        
        
        data[sample]=np.nan_to_num(vals)
        if i==0:
            data_mean=mean
        # for chrom, s, e in pos:
        #
        #     val=bw.values(chrom, s, e)
        #     #print(val)
        #     val=np.array(val)
        #     #print(val.shape)
        #     val=val.reshape([-1,binsize]).mean(axis=1)
        #     #print(val.shape)
        #     data[sample].append(val)
        #     if i==0:
        #         data_mean.append(bw.stats(chrom, s, e, exact=True)[0])
    labels=[]
    if clustering=="kmeans_all":
        mat=[]
        for sample in order:
            #print(np.array(data[sample]).shape)
            mat.append(data[sample])
        mat=np.concatenate(mat, axis=1)
        #print(mat.shape)
        mat=zscore(mat, axis=0)
        pca=PCA(n_components=5, random_state=1)
        xpca=pca.fit_transform(mat)
        kmean = KMeans(n_clusters=n_clusters, random_state=0,n_init=10)
        kmX=kmean.fit(xpca)
        labels=kmX.labels_
    elif clustering=="kmeans":
        mat=np.array(data[order[0]])
        kmean = KMeans(n_clusters=n_clusters, random_state=0,n_init=10)
        kmX=kmean.fit(mat)
        labels=kmX.labels_
    elif clustering=="kmeans_auto":
        mat=np.array(data[order[0]])
        optimalclusternum=_optimal_kmeans(mat, [2, 10])
        n_clusters=np.amax(optimalclusternum)
        kmean = KMeans(n_clusters=n_clusters, random_state=0,n_init=10)
        kmX=kmean.fit(mat)
        labels=kmX.labels_
        
    sortindex=np.argsort(data_mean)[::-1]
    pos=np.array(pos)[sortindex]

    plotindex=np.arange(0, len(pos), len(pos)//1000)
    fig, axes=plt.subplots(ncols=len(order), figsize=[2*len(order), 6])
    if len(order)==1:
        axes=[axes]
    for i, (sample, ax) in enumerate(zip(order, axes)):
        vals=data[sample]

        vals=np.array(vals)

        vals=vals[sortindex]
        if clustering!="":
            
            _labels=np.array(labels)[sortindex]
            sortindex2=np.argsort(_labels)
            vals=vals[sortindex2]
            _pos=pos[sortindex2]
            _labels=_labels[sortindex2]
            _labels=_labels[plotindex]
            ulabel, clabel=np.unique(_labels, return_counts=True)

        vals=vals[plotindex]
        if np.sum(vals)<=10**-6:
            raise Exception("Signals may be empty, or you may be using a wrong bed file.")
        if np.amax(vals) <= 1  and np.amin(vals)>=0:
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none",norm=LogNorm(vmin=0.00001))
        elif np.amin(vals) < 0:
            vals=np.where(vals < 0, 0, vals)
            warnings.warn("The bigwig contains negative values. Is this coverage file?")
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none",norm=LogNorm(vmin=1))
        elif minval!=None:
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none",norm=LogNorm(vmin=minval))
        else:
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none",norm=LogNorm(vmin=1))
        
        
        
        cbaxes = fig.add_axes([0.8*(i+1)/len(order), 0.93, 0.03, 0.06]) 
        cb = plt.colorbar(im, cax = cbaxes)
        
            
        # print(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4))
        # print(np.arange(-extend, extend+extend//2, extend//2))
        ax.set_xticks(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4),labels=np.arange(-extend, extend+extend//2, extend//2))
        ax.set_xlabel("Peak range [bp]")
        ax.set_title(sample)
        
        if clustering!="":
            total=0
            isep=0
            for label, count in zip(ulabel, clabel):
                
                total+=count
                if isep==ulabel.shape[0]-1:
                    break
                ax.plot([0, vals.shape[1]-1], [total,total], color="black")
                isep+=1
        ax.set_yticks([])
        ax.grid(False)
    _save(save, "heatmap")
    fig, axes2=plt.subplots(ncols=len(order), figsize=[4*len(order), 3])
    if len(order)==1:
        axes2=[axes2]
    maxval=0
    for i, (sample, ax) in enumerate(zip(order, axes2)):
        vals=data[sample]
        vals=np.array(vals)
        
        labels=np.array(labels)
        if clustering !="":
            for label, count in zip(ulabel, clabel):
                
                _vals=np.mean(vals[labels==label],axis=0)
                #print(_vals.shape)
                #print(vals.shape)
                ax.plot(np.arange(-extend, extend, 2*extend//(_vals.shape[0])),_vals, label=label)
                _maxval=np.max(_vals)
                if maxval < _maxval:
                    maxval=_maxval
            plt.legend()
        else:
            vals=vals.mean(axis=0)
            ax.plot(np.arange(-extend, extend, 2*extend//(vals.shape[0])),vals)
            _maxval=np.max(vals)
            if maxval < _maxval:
                maxval=_maxval 
        if i==0:
            ax.set_ylabel("Mean signal value")
        # print(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4))
        # print(np.arange(-extend, extend+extend//2, extend//2))
        #ax.set_xticks(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4),labels=np.arange(-extend, extend+extend//2, extend//2))
        ax.set_xlabel("Peak range [bp]")
        ax.set_title(sample)
    
    for ax in axes2:
        ax.set_ylim(0, maxval*1.05)
    
    _save(save, "average")
    
    return {"values":data,"potisons":_pos,"labels":labels, "axes":axes,"axes2":axes2}



def _plot_genebody(files: dict,
                files_minus: Optional[dict]=None,
                 bed: Union[list, str]="",
                 gff: str="",
                 centered_at: str="TSS", 
                 order: list=[], 
                 extend: int=3000, 
                 palette: str="coolwarm",
                 binsize: int=10,
                 clustering: str="",
                 n_clusters: int=5,
                 save: str="",lognorm=True,n_jobs=-1)-> Dict:
    
    """
    Plotting bigwig files around gene bodies.  
    
    Parameters
    ----------
    files : dict
        A dictionary whose keys are sample names and values are file names 
    bed : str
        A bed file name containing the list of genomic regions
    order : list
        A sample order to plot. if a clustering option (except kmeans_all) is chosen, the first sample is the target of clustering analysis. 
    binsize: int
        A bin size to reduce the bigwig signal resolution.
    clustering : str
        Availables: ["kmeans", "kmeans_all", "kmeans_auto"]
    n_clusters : int
        the number of clusters for clustering analysis. if you chose kmeans_auto, this option will be ignored.
    Returns
    -------
        dict {"values":data,"potisons":pos,"labels":labels, "axes":axes, "axes2":axes2}
        values: dictionary containing sample names as the keys and signals as the values
        positions: genomic positions sorted by signals displayed in the plot
        labels: list of clustering labels if kmean clustering was performed
        
    Raises
    ------
    Notes
    -----
    References
    ----------
    See Also
    --------
    Examples
    --------
    """
    if len(order)==0:
        order=list(files.keys())
    for k, v in files.items():
        assert type(v)==str, "filenames must be str, but {} was give.".format(type(v))
        if not os.path.isfile(v)==True:
            raise Exception("{} does not exist.".format(v))
    bw=pwg.open(files[order[0]])
    chrom_sizes=bw.chroms()
    
    if len(bed)>0 and len(gff)>0:
        warnings.warn("Ignoring the gff file, since the bed file is provided.")
    
    if len(bed)>0:
        oriset=set(["+", "-"])
        bedchroms=set()
        if type(bed)==str and len(bed)>0:
            with open(bed) as fin:
                for l in fin:
                    l=l.split()
                    if not l[-1] in oriset:
                        raise Exception("The bed file must contain orientation information at the last column. e.g., chr1\t10000\t20000\t.\t.\t+")
                    
                    if not l[0] in chrom_sizes:
                        raise Exception("There may be mismatches in chromosome names between the bed file and bigwig files.\n\
                    the chromosome names in the bed file are: {}\n\
                    those in the bigwig files are: {}".format(l[0], chrom_sizes.keys())) 
            posplus=[]
            posminus=[]
            seen=set()
            with open(bed) as fin:
                for l in fin:
                    l=l.split()
                    chrom, s, e, ori=l[0],l[1],l[2],l[-1]
                    tmp="_".join([chrom, s, e, ori])
                    if not tmp in seen:
                        seen.add(tmp)
                    else:
                        continue
                    
                    bedchroms.add(chrom)
                    if ori=="+":
                        cs=int(s)-extend
                        ce=int(s)+extend
                        if cs <0 or chrom_sizes[chrom] < ce:
                            continue
                        posplus.append([chrom,cs,ce])
                    else:
                        cs=int(e)-extend
                        ce=int(e)+extend
                        if cs <0 or chrom_sizes[chrom] < ce:
                            continue
    
                        posminus.append([chrom,cs,ce])
        else:
            posplus=bed[0]
            posminus=bed[1]
    
    elif len(gff)>0:
        transcripts=_readgff_transcripts(gff, extend)
    
    
    if len(order)==0:
        order=list(files.keys())
    data={}
    data_mean=[]
    start_time=time.time()
    
    for i, sample in enumerate(order):
        bigwig=files[sample]
        
        if len(gff)>0:
            tid_list=[]
            pvals=[]
            tids=transcripts["+"].keys()
            tid_list.extend(tids)
            pvals=Parallel(n_jobs=n_jobs)(delayed(_read_transcripts)(bigwig, transcripts["+"][tid], binsize, extend)  for tid in tids)
            print(pvals[:10])
            if files_minus!=None:
                bigwig=files_minus[sample]
            mvals=[]
            tids=transcripts["-"].keys()
            mvals=Parallel(n_jobs=n_jobs)(delayed(_read_transcripts)(bigwig, transcripts["-"][tid], binsize, extend)  for tid in tids)
            tid_list.extend(tids)
            
        #bw=pwg.open(bigwig)
        else:
            pvals=[]
            pmean=[]
            data[sample]=[]
            pvals, pmean=zip(*Parallel(n_jobs=n_jobs)(delayed(_read_and_reshape_bw)(chrom, s, e, bigwig, binsize)  for chrom, s, e in posplus))
            
            if files_minus!=None:
                bigwig=files_minus[sample]
                #bw=pwg.open(bigwig)
                mvals, mmean=zip(*Parallel(n_jobs=n_jobs)(delayed(_read_and_reshape_bw)(chrom, s, e, bigwig, binsize)  for chrom, s, e in posminus))
            else:
                mvals, mmean=zip(*Parallel(n_jobs=n_jobs)(delayed(_read_and_reshape_bw)(chrom, s, e, bigwig, binsize)  for chrom, s, e in posminus))
        
        mvals=np.flip(mvals, axis=1)
        vals=np.nan_to_num(np.concatenate([pvals,mvals]))
        print(vals.shape)
        if i ==0:
            sums=np.sum(vals, axis=1)
            remove=sums >np.quantile(sums, 0.1)
            print(np.sum(remove))
        vals=vals[remove]
        #vals=vals.reshape([-1,vals.shape[1]//binsize,binsize]).mean(axis=2)
        print(np.amax(sums),np.amin(sums),vals.shape)
        data[sample]=vals
        
        if i==0:
            data_mean=np.amax(vals,axis=1)
        # for chrom, s, e in posplus:
        #     val=bw.values(chrom, s, e)
        #     #print(val)
        #     val=np.array(val)
        #     #print(val.shape)
        #     val=val.reshape([-1,binsize]).mean(axis=1)
        #     #print(val.shape)
        #     data[sample].append(val)
        #     if i==0:
        #         mval=bw.stats(chrom, s, e, exact=True)[0]
        #         if mval==None:
        #             data_mean.append(0)
        #         else:
        #             data_mean.append(mval)
        #
        # for chrom, s, e in posminus:
        #     chromsize=chrom_sizes[chrom]
        #     val=bw.values(chrom, s, e)
        #     #print(val)
        #     val=np.array(val)[::-1]
        #     #print(val.shape)
        #     val=val.reshape([-1,binsize]).mean(axis=1)
        #     #print(val.shape)
        #     data[sample].append(val)
        #     if i==0:
        #         mval=bw.stats(chrom, s, e, exact=True)[0]
        #         if mval==None:
        #             data_mean.append(0)
        #         else:
        #             data_mean.append(mval)
    print("Reading bigwig file took ", time.time()-start_time)
    labels=[]
    if clustering=="kmeans_all":
        mat=[]
        for sample in order:
            print(np.array(data[sample]).shape)
            mat.append(data[sample])
        mat=np.concatenate(mat, axis=1)
        mat=np.nan_to_num(mat)
        print(mat.shape)
        mat=zscore(mat, axis=0)
        pca=PCA(n_components=5, random_state=1)
        xpca=pca.fit_transform(mat)
        kmean = KMeans(n_clusters=n_clusters, random_state=0,n_init=10)
        kmX=kmean.fit(xpca)
        labels=kmX.labels_
    elif clustering=="kmeans":
        mat=np.nan_to_num(np.array(data[order[0]]))
        kmean = KMeans(n_clusters=n_clusters, random_state=0,n_init=10)
        kmX=kmean.fit(mat)
        labels=kmX.labels_
    elif clustering=="kmeans_auto":
        mat=np.nan_to_num(np.array(data[order[0]]))
        optimalclusternum=_optimal_kmeans(mat, [2, 10])
        n_clusters=np.amax(optimalclusternum)
        kmean = KMeans(n_clusters=n_clusters, random_state=0,n_init=10)
        kmX=kmean.fit(mat)
        labels=kmX.labels_
    if len(gff) >0:
        pos=np.array(tid_list)
    else:
        pos=np.array(posplus+posminus)[remove]
    #data_mean=np.nan_to_num(data_mean)
    sortindex=np.argsort(data_mean)[::-1]
    print(sortindex)
    pos=pos[sortindex]
    print(pos[:20])
    plotindex=np.arange(0, len(pos), len(pos)//1000)
    fig, axes=plt.subplots(ncols=len(order), figsize=[2*len(order), 6])
    if len(order)==1:
        axes=[axes]
    for i, (sample, ax) in enumerate(zip(order, axes)):
        vals=data[sample]
        
        vals=np.array(vals)
        
        vals=vals[sortindex]
        vals=vals/np.amax(vals, axis=1)[:,None]
        ax.plot(vals[0])
        ax.plot(vals[1])
        ax.plot(vals[2])
        plt.show()
        sys.exit()
        

        if clustering!="":
            
            _labels=np.array(labels)[sortindex]
            sortindex2=np.argsort(_labels)
            vals=vals[sortindex2]
            _pos=pos[sortindex2]
            _labels=_labels[sortindex2]
            _labels=_labels[plotindex]
            ulabel, clabel=np.unique(_labels, return_counts=True)

        vals=vals[plotindex]
        print(vals.shape)
        if lognorm==True:
            
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none",norm=LogNorm(vmin=np.amax([1, np.quantile(sums,0.5)])))
        else:
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none")
        cbaxes = fig.add_axes([0.8*(i+1)/len(order), 0.93, 0.03, 0.06]) 
        cb = plt.colorbar(im, cax = cbaxes)
        
            
        # print(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4))
        # print(np.arange(-extend, extend+extend//2, extend//2))
        ax.set_xticks(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4),labels=np.arange(-extend, extend+extend//2, extend//2))
        ax.set_xlabel("Peak range [bp]")
        ax.set_title(sample)
        
        if clustering!="":
            total=0
            isep=0
            for label, count in zip(ulabel, clabel):
                
                total+=count
                if isep==ulabel.shape[0]-1:
                    break
                ax.plot([0, vals.shape[1]-1], [total,total], color="black")
                isep+=1
        ax.set_yticks([])
        ax.grid(False)
        
    _save(save, "heatmap")

    fig, axes2=plt.subplots(ncols=len(order), figsize=[4*len(order), 3])
    if len(order)==1:
        axes2=[axes2]
    maxval=0
    for i, (sample, ax) in enumerate(zip(order, axes2)):
        vals=data[sample]
        vals=np.array(vals)
        labels=np.array(labels)
        if clustering !="":
            for label, count in zip(ulabel, clabel):
                
                _vals=np.mean(vals[labels==label],axis=0)
                print(_vals.shape)
                print(vals.shape)
                ax.plot(np.arange(-extend, extend, 2*extend//(_vals.shape[0])),_vals, label=label)
                _maxval=np.max(_vals)
                if maxval < _maxval:
                    maxval=_maxval
            plt.legend()
        else:
            vals=vals.mean(axis=0)
            ax.plot(np.arange(-extend, extend, 2*extend//(vals.shape[0])),vals)
            _maxval=np.max(vals)
            if maxval < _maxval:
                maxval=_maxval 
        if i==0:
            ax.set_ylabel("Mean signal value")
        # print(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4))
        # print(np.arange(-extend, extend+extend//2, extend//2))
        #ax.set_xticks(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4),labels=np.arange(-extend, extend+extend//2, extend//2))
        ax.set_xlabel("Peak range [bp]")
        ax.set_title(sample)
    
    for ax in axes:
        ax.set_ylim(0, maxval*1.05)
    _save(save, "average")
    return {"values":data,"potisons":pos,"labels":labels, "axes":axes, "axes2":axes2}

def plot_bed_correlation(files:dict,
                         method: str="pearson",
                         palette: str="coolwarm",
                         figsize: list=[6,6], 
                         show_val: bool=True, 
                         clustermap_param: dict={},
                         step: int=100,
                         show: bool=False):
    for k, v in files.items():
        assert type(v)==str, "filenames must be str, but {} was give.".format(type(v))
        if not os.path.isfile(v)==True:
            raise Exception("{} does not exist.".format(v))
    bedintervals={}
    minmax={}
    for sample, bedfile in files.items():
        bedintervals[sample]={}
        with open(bedfile) as fin:
            for l in fin:
                if l.startswith("#"):
                    continue
                l=l.split()
                chrom=l[0]
                s, e=int(l[1]), int(l[2])
                if not chrom in bedintervals[sample]:
                    bedintervals[sample][chrom]=[]
                if not chrom in minmax:
                    minmax[chrom]=[]
                bedintervals[sample][chrom].append([s,e])
                minmax[chrom].append(s)
                minmax[chrom].append(e)
    minmaxval={}
    for chrom, se in minmax.items():
        minmaxval[chrom]=[np.min(se), np.max(se)]
    
    mat=[]
    samples=[]
    for sample, chrom_intervals in bedintervals.items():
        print(sample)
        tmp=[]
        for chrom, intervals in chrom_intervals.items():
            tmp2=[0 for _ in range(minmaxval[chrom][0], minmaxval[chrom][1],step)]
            for s, e in intervals:
                s=s-minmaxval[chrom][0]
                e=e-minmaxval[chrom][0]
                for i in range(s, e,step):
                    tmp2[i//step]=1
            tmp.extend(tmp2)
        mat.append(tmp)
        samples.append(sample)
    mat=np.array(mat)
    mat = mat[:, np.any(mat, axis=0)]
    print(mat[:100,:100])
    if method=="pearson":
        #dmat=Parallel(n_jobs=-1)(delayed(_calc_pearson)(ind, mat) for ind in list(it.combinations(range(mat.shape[0]), 2)))
        dmat=np.corrcoef(mat)
        dmat=np.array(dmat)
        #dmat=squareform(dmat)
        print(dmat)
        #dmat+=np.identity(dmat.shape[0])
    else:
        dmat=squareform(pdist(mat, method))
    
            
    
    g=sns.clustermap(data=dmat,xticklabels=samples,yticklabels=samples,
               method="ward", cmap=palette,
               col_cluster=True,
               row_cluster=True,
               figsize=figsize,
               rasterized=True,
               #cbar_kws={"label":"Pearson correlation"}, 
               annot=show_val,
               **clustermap_param)
    if method=="pearson":
        title="Pearson correlation"
    else:
        title=method+" distance"
    g.cax.set_ylabel(title, rotation=-90,va="bottom")
    plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)  # For y axis
    plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90) # For x axis
    if show:
        plt.show()
        
    return {"correlation": dmat,"samples":samples}
    
    
    
    
def frip_calc(files:dict,):
    
    pass
    
    
    
    

if __name__=="__main__":
    #test="plot_bigwig"
    test="plot_bigwig_correlation"
    
    #test="plot_bigwig"
    test="plot_bed_correlation"
    
    test="plot_average"
    #test="plot_genebody"
    #test="call_superenhancer"
    #test="plot_genebody"
    import glob
    
    if test=="plot_bed_correlation":
        fs= {"KMT2A":"/media/koh/grasnas/home/data/omniplot/KMT2A_ENCFF644EPI_srt.bed",
             "KMT2B":"/media/koh/grasnas/home/data/omniplot/HepG2_KMT2B_ENCFF036WCY_srt.bed",
             "HNF4G_rep1":"/media/koh/grasnas/home/data/omniplot/HNF4G_ENCFF413UQM_srt.bed",
             "HNF4G_rep2":"/media/koh/grasnas/home/data/omniplot/HNF4G_ENCFF123FQW_srt.bed",
             "HNF1A_rep1":"/media/koh/grasnas/home/data/omniplot/HNF1A_ENCFF696TGC_srt.bed",
             "HNF1A_rep2":"/media/koh/grasnas/home/data/omniplot/HNF1A_ENCFF162SGM_srt.bed",}
        
        plot_bed_correlation(fs,step=5000, method="correlation")
        plt.show()
    elif test=="plot_bigwig":
        fs= {"KMT2A":"/media/koh/grasnas/home/data/omniplot/HepG2_KMT2A-human_ENCFF406SHU.bw",
             "KMT2B":"/media/koh/grasnas/home/data/omniplot/HepG2_KMT2B-human_ENCFF709UTL.bw"}
        bed="/media/koh/grasnas/home/data/omniplot/tmp3.bed"
        gff="/media/koh/grasnas/home/data/omniplot/gencode.v40.annotation.gff3"
        
        plot_bigwig(fs,bed,gff, step=10)
        plt.show()
    elif test=="plot_bigwig_correlation":
        fs= {"KMT2A":"/media/koh/grasnas/home/data/omniplot/HepG2_KMT2A-human_ENCFF406SHU.bw",
             "KMT2B":"/media/koh/grasnas/home/data/omniplot/HepG2_KMT2B-human_ENCFF709UTL.bw",
             "HNF1A_rep1":"/media/koh/grasnas/home/data/omniplot/HepG2_HNF1A-human_ENCFF397BTX.bw",
             "HNF1A_rep2":"/media/koh/grasnas/home/data/omniplot/HepG2_HNF1A-human_ENCFF502ACF.bw",
             "HNF4A_rep1":"/media/koh/grasnas/home/data/omniplot/HepG2_HNF4A-human_ENCFF467SMI.bw",
             "HNF4A_rep2":"/media/koh/grasnas/home/data/omniplot/HepG2_HNF4A-human_ENCFF712VYA.bw",}
        
        plot_bigwig_correlation(fs,step=1000)
        plt.show()
    elif test=="call_superenhancer":
        gff="/media/koh/grasnas/home/data/omniplot/gencode.v40.annotation.gff3"
        
        gff="/media/koh/grasnas/home/data/omniplot/hg38.refGene.gtf"
        gff="/media/koh/grasnas/home/data/omniplot/gencode.vM31.annotation.gff3"
        f="/media/koh/grasnas/home/data/omniplot/HepG2_KMT2B-human_ENCFF709UTL.bw"
        f="/media/koh/grasnas/home/data/omniplot/SRR620141_1_trimmed_srt_no_dups.bw"
        peak="/media/koh/grasnas/home/data/omniplot/HepG2_KMT2B_ENCFF036WCY_srt.bed"
        peak="/media/koh/grasnas/home/data/omniplot/peaks_mm39.bed"
        #tss="/media/koh/grasnas/home/data/omniplot/gencode.v40.annotation_tss_srt.bed"
        call_superenhancer(bigwig=f, bed=peak,tss_dist=5000,plot_signals=True , gff=gff,closest_genes=True,
                           go_analysis=False,nearest_k=3)
        plt.show()
    elif test=="plot_average":
        
        peak="/media/koh/grasnas/home/data/omniplot/HepG2_KMT2B_ENCFF036WCY_srt.bed"
        files={"KMT2B":"/media/koh/grasnas/home/data/omniplot/HepG2_KMT2B-human_ENCFF709UTL.bw",
                            "KMT2A":"/media/koh/grasnas/home/data/omniplot/HepG2_KMT2A-human_ENCFF406SHU.bw"}
        plot_average(files=files,
                            bed=peak,
                            order=["KMT2B",  "KMT2A"],clustering="kmeans"
                            )
        plt.show()
    elif test=="plot_genebody":

        gff="/media/koh/grasnas/home/data/omniplot/gencode.v40.annotation.gff3"
        files={"adrenal_grand":"/media/koh/grasnas/home/data/omniplot/adrenal_grand_plus_ENCFF809PGE.bigWig"}
        files_minus={"adrenal_grand":"/media/koh/grasnas/home/data/omniplot/adrenal_grand_minus_ENCFF896WDT.bigWig"}
        _plot_genebody(files=files,files_minus=files_minus,
                            gff=gff,binsize=1,
                            order=["adrenal_grand"]#,clustering="kmeans"
                            )
        plt.show()
    #raise NotImplementedError("This function will plot ChIP-seq data.")

Functions

def call_superenhancer(bigwig: str, bed: str, stitch=25000, tss_dist: int = 0, plot_signals=False, gff: str = '', closest_genes: bool = False, nearest_k: int = 1, go_analysis: bool = False, gonames: list = ['GO_Biological_Process_2021', 'Reactome_2022', 'WikiPathways_2019_Human'], sample: str = 'Sample', n_jobs: int = 12, save: str = '') ‑> Dict[~KT, ~VT]

Find super enhancers and plot an enhancer rank.

Parameters

bigwig : str: A bigwig file.
peakfile : str: A peak file.
tss_dist : int, optional: A distance from transcriptional start sites. If this option is more than 0, you need provide gff option too.
stitch : int, optional: The maximum distance of enhancers to stitch together.
plot_signals : bool, optional: Drawing super enhancer regions +- 10kb
gff : str, optional (required when tss_dist is provided): A gff or gtf file that contains gene annotations. You need to chose a correct genome version of a gff/gtf file
closest_genes : bool=False,: If this is True, closest genes of SE will be shown, instead of genomic regions. You can specify the number of closest genes by "nearest_k" option.
nearest_k : int, optional: The number of closest genes to be included. Default: 1.
go_analysis : bool, optional: GSEA analysis based on the rank of SEs.
gonames : list=["GO_Biological_Process_2021","Reactome_2022","WikiPathways_2019_Human"]: Gene sets for GSEA analysis.
sample : str,optional: The sample name to appear in the signal plot

Returns

dict {"ax": ax, "positions":pos, "signals":y,"nearest_genes":_nearestgenes}

Raises

Notes

References

Examples

Expand source code

def call_superenhancer(bigwig: str, 
                       bed: str,
                       stitch=25000,
                       tss_dist: int=0,
                       plot_signals=False,
                       gff: str="",
                       closest_genes: bool=False,
                       nearest_k: int=1,
                       go_analysis: bool=False,
                       gonames: list=["GO_Biological_Process_2021","Reactome_2022","WikiPathways_2019_Human"],
                       sample: str="Sample",
                       n_jobs: int=12,
                       save: str="")->Dict:
    """
    Find super enhancers and plot an enhancer rank.  
    
    Parameters
    ----------
    bigwig : str
        A bigwig file.
    peakfile : str
        A peak file.
    tss_dist: int, optional
        A distance from transcriptional start sites. If this option is more than 0, you need provide gff option too.
    stitch: int, optional
        The maximum distance of enhancers to stitch together.
    plot_signals : bool, optional
        Drawing super enhancer regions +- 10kb
    gff: str, optional (required when tss_dist is provided)
        A gff or gtf file that contains gene annotations. You need to chose a correct genome version of a gff/gtf file 
    
    closest_genes: bool=False,
        If this is True, closest genes of SE will be shown, instead of genomic regions. You can specify the number 
        of closest genes by "nearest_k" option.
    nearest_k: int, optional
        The number of closest genes to be included. Default: 1.
        
    go_analysis: bool, optional
        GSEA analysis based on the rank of SEs.
        
    gonames: list=["GO_Biological_Process_2021","Reactome_2022","WikiPathways_2019_Human"]
        Gene sets for GSEA analysis.
    
    sample: str,optional
        The sample name to appear in the signal plot
    
    Returns
    -------
        dict {"ax": ax, "positions":pos, "signals":y,"nearest_genes":_nearestgenes}
    Raises
    ------
    Notes
    -----
    References
    ----------
    See Also
    --------
    Examples
    --------
    """ 
    
    bw=pwg.open(bigwig)
    chrom_sizes=bw.chroms()
    peaks=_read_peaks(bed)
    #stitched=stitching(peaks, stitch)
    
    
    if tss_dist != 0:
        start_time=time.time()
        stitched=pr.from_dict(_stitching_for_pyrange(peaks, stitch))
        
        if gff.endswith("gff") or gff.endswith("gff3"):
            gff_dict=_readgff(gff, "transcript",tss_dist)
        elif gff.endswith("gtf"):
            gff_dict=_readgtf(gff, "transcript",tss_dist)
        else:
            raise Exception("The gff/gtf file name is required to have either gff, gff3, or gtf extension.")
        gffr=pr.from_dict(gff_dict)
        
        _stitched=stitched.subtract(gffr, nb_cpu=n_jobs)
        stitched={}
        for chrom, s, e in zip(_stitched.Chromosome, _stitched.Start, _stitched.End):
            if not chrom in stitched:
                stitched[chrom]=[]
            stitched[chrom].append([s,e])
        print("removing enhancers around TSS took ", time.time()-start_time)
        # tss_pos=read_tss(tss,tss_dist)
        # stitched=remove_close_to_tss(stitched, tss_pos)
    else:
        stitched=_stitching(peaks, stitch)
        
        
    """Obtaining closest genes"""
    _nearestgenes=[]
    if closest_genes==True:
        start_time=time.time()
        if gff.endswith("gff") or gff.endswith("gff3"):
            gff_dict=_readgff2(gff, "gene", others=["gene_name"])
        elif gff.endswith("gtf"):
            gff_dict=_readgtf2(gff, "transcript", others=["gene_name"])
        else:
            raise Exception("The gff/gtf file name is required to have either gff, gff3, or gtf extension.")
        gffr=pr.from_dict(gff_dict)
        # grlist=[]
        # for chrom_se in pos:
        #     chrom, se=chrom_se.split(":")
        #     s, e=se.split("-")
        #     grlist.append(pr.from_dict({"Chromosome":[chrom],"Start":[int(s)],"End":[int(e)]}))
        # if nearest_k==1:
        #     _nearestgenes=Parallel(n_jobs=-1)(delayed(gr.nearest)(gffr) for gr in grlist)
        # else:
        #     _nearestgenes=Parallel(n_jobs=-1)(delayed(gr.k_nearest)(gffr, k=nearest_k) for gr in grlist)
        grlist={"Chromosome":[],"Start":[],"End":[]}
        stiched_dit={}
        for chrom, se in stitched.items():
            for s, e in se:
                stiched_dit[chrom+":"+str(s)+"-"+str(e)]=[]
                #grlist.append(pr.from_dict({"Chromosome":[chrom],"Start":[int(s)],"End":[int(e)]}))
                grlist["Chromosome"].append(chrom)
                grlist["Start"].append(int(s))
                grlist["End"].append(int(e))
                
        gr=pr.from_dict(grlist)
        _nearestgenes_gr=gr.k_nearest(gffr, k=nearest_k, nb_cpu=n_jobs)
        # _nearestgenes=[]
        # tmp=[]
        for i, (chrom, s, e, gene_name) in enumerate(zip(_nearestgenes_gr.Chromosome, _nearestgenes_gr.Start,_nearestgenes_gr.End,_nearestgenes_gr.gene_name)):
            stiched_dit[chrom+":"+str(s)+"-"+str(e)].append(gene_name)
        #
        #     tmp.append(gene_name)
        #     if i%nearest_k==0 and i>0:
        #         _nearestgenes.append(tmp)
        #         tmp=[]
        # _nearestgenes.append(tmp)
        print("Finding the nearest genes took ", time.time()-start_time)
        
    start_time=time.time()
    mat=[]
    pos=[]
    for chrom, se in stitched.items():
        for s, e in se:
            #print(chrom,s,e)
            val=bw.stats(chrom,s,e, type="sum", exact=True)[0]
            if val==None:
                val=0
            mat.append(val)
            pos.append(chrom+":"+str(s)+"-"+str(e))
            
    x, y, pos, sindex, srt=_find_extremes(mat, pos)
    b=-x[sindex]*np.amax(y) +y[sindex]
    print("Finding super enhancer took ", time.time()-start_time, "sec")
    
    # print(len(_nearestgenes), _nearestgenes[:10])
    # print(srt.shape, srt[:10])
    _nearestgenes=[stiched_dit[chromse] for chromse in pos]
    
        
    fig, ax=plt.subplots(figsize=[6,4])
    ax.scatter(x[:sindex], y[:sindex],s=5,rasterized=True)
    ax.scatter(x[sindex:], y[sindex:], color="r",s=5,rasterized=True)
    ax.plot(x,x*np.amax(y)+b, color="gray")
    
    if closest_genes==True:
        for j in range(5):
            ax.text(x[-j-1],y[-j-1], ",".join(_nearestgenes[-j-1]))
    else:
        for j in range(5):
            ax.text(x[-j-1],y[-j-1], pos[-j-1])
    ax.text(0,y[-1]*0.7, "SE No.: "+str( y[sindex:].shape[0])+"\nCutoff: "+str(np.round(y[sindex],3)))
    ax.set_xlabel("Enhancers")
    ax.set_ylabel("Signal sum")
    ax.set_ylim(0,max(y)*1.05)
    
    tick_=x.shape[0]//3
    if 100<tick_<=1000:
        tick_=500
    elif 1000<tick_<=5000:
        tick_=1000
    elif 5000<tick_<=10000:
        tick_=5000
    else:
        tick_=10000
    ax.set_xticks(np.arange(0,x.shape[0], tick_)/x.shape[0],labels=np.arange(0,x.shape[0], tick_))
    plt.ticklabel_format(axis="y", style="sci", scilimits=(0,0))
    
    _save(save, "rank")
    
    plt.subplots_adjust(right=0.620, bottom=0.130)
    
    
    stitched_out=os.path.splitext(bed)[0]+"_SE.bed"
    with open(stitched_out, "w") as fout:
        
        if closest_genes==True:
            rank=1
            for _pos, _sig, _gr in zip(reversed(pos[sindex:]), reversed(y[sindex:]),reversed(_nearestgenes[sindex:])):
                chrom, se=_pos.split(":")
                s, e=se.split("-")
                fout.write("\t".join([chrom,str(s),str(e), ",".join(_gr), str(_sig),str(rank)+"\n"]))
                rank+=1
        
        else:
            rank=1
            for _pos, _sig in zip(reversed(pos[sindex:]), reversed(y[sindex:])):
                chrom, se=_pos.split(":")
                s, e=se.split("-")
                fout.write("\t".join([chrom,str(s),str(e), str(_sig),str(rank)+"\n"]))
                rank+=1
    print("The SE file was written in "+stitched_out)
    if go_analysis==True:
        print("Now, doing GO anlyses..")
        start_time=time.time()
        try:
            import gseapy
        except ImportError:
            raise ImportError('If you want GO analysis, you need to install gseapy. Try "pip install gseapy".')
        godf={0:[],1:[]}
        y=(y-np.mean(y))/np.std(y)
        for _sig, genes in zip(reversed(y), reversed(_nearestgenes)):
            for gene in list(genes):
                godf[0].append(gene)
                godf[1].append(_sig)
        #print(godf)
        godf=pd.DataFrame(godf)
        godf=godf.set_index(0)
        for name in gonames:
            gs_res = gseapy.prerank(rnk=godf, # or data='./P53_resampling_data.txt'
                         gene_sets=name,
                             threads=4,
                             min_size=5,
                             max_size=1000,
                             permutation_num=100, # reduce number to speed up testing
                             outdir=None, # don't write to disk
                             seed=6,
                             verbose=True,) # see what's going on behind the scenes)
            #print(gs_res.res2d)
            for index, row in gs_res.res2d.iterrows():
                #print(row)
                if row["FDR q-val"] <0.05:
                    t=row.Term
                    gseapy.gseaplot(rank_metric=gs_res.ranking,
                             term=t,
                             cmap=plt.cm.seismic,
                             **gs_res.results[t],
                             ofname=os.path.splitext(bed)[0]+"_"+name+"_gsea.pdf"
                             )
        print("GO analysis took ", time.time()-start_time)
    if plot_signals:
        
        if gff=="":
            raise Exception("Please provide a gff file to draw signals.")
        letters = string.ascii_lowercase
        tmpfile=''.join(random.choice(letters) for i in range(10))+".bed"
        highlights=[]
        with open(tmpfile, "w") as fout:
            rank=1
            for pos, sig in zip(reversed(pos[sindex:]), reversed(y[sindex:])):
                chrom, se=pos.split(":")
                s, e=se.split("-")
                fout.write("\t".join([chrom,str(int(s)-10000),str(int(e)+10000), str(sig),str(rank)+"\n"]))
                highlights.append([chrom,int(s),int(e)])
                rank+=1
                if rank==6:
                    break
                    
        plot_bigwig(files={sample: bigwig}, 
                    bed=tmpfile, gff=gff, 
                    step=100,
                    stack_regions="vertical",
                    highlight=highlights,n_jobs=n_jobs, save=save)
        os.remove(tmpfile)
    return {"ax": ax, "positions":pos, "signals":y,"nearest_genes":_nearestgenes}

def frip_calc(files: dict)

Expand source code

def frip_calc(files:dict,):
    
    pass

def plot_average(files: dict, bed: Union[str, list], order: list = [], extend: int = 500, palette: str = 'coolwarm', binsize: int = 10, clustering: str = '', n_clusters: int = 5, minval: Optional[float] = None, orientaion=False, save='', n_jobs: int = -1)

Plotting bigwig files centered at peak regions.

Parameters

files : dict: A dictionary whose keys are sample names and values are file names
bed : str: A bed file name containing the list of genomic regions
order : list: A sample order to plot. if a clustering option (except kmeans_all) is chosen, the first sample is the target of clustering analysis.
binsize : int: A bin size to reduce the bigwig signal resolution.
clustering : str: Availables: ["kmeans", "kmeans_all", "kmeans_auto"]
n_clusters : int: the number of clusters for clustering analysis. if you chose kmeans_auto, this option will be ignored.

Returns

dict {"values":data,"potisons":pos,"labels":labels}
values: dictionary containing sample names as the keys and signals as the values
positions: genomic positions sorted by signals displayed in the plot
labels: list of clustering labels if kmean clustering was performed

Raises

Notes

References

Examples

Expand source code

def plot_average(files: dict, 
                 bed: Union[str,list], 
                 order: list=[], extend: int=500, 
                 palette: str="coolwarm",
                 binsize: int=10,
                 clustering: str="",
                 n_clusters: int=5,
                 minval: Optional[float]=None,
                 orientaion=False,
                 save="",
                 n_jobs: int=-1):
    """
    Plotting bigwig files centered at peak regions.  
    
    Parameters
    ----------
    files : dict
        A dictionary whose keys are sample names and values are file names 
    bed : str
        A bed file name containing the list of genomic regions
    order : list
        A sample order to plot. if a clustering option (except kmeans_all) is chosen, the first sample is the target of clustering analysis. 
    binsize: int
        A bin size to reduce the bigwig signal resolution.
    clustering : str
        Availables: ["kmeans", "kmeans_all", "kmeans_auto"]
    n_clusters : int
        the number of clusters for clustering analysis. if you chose kmeans_auto, this option will be ignored.
    Returns
    -------
        dict {"values":data,"potisons":pos,"labels":labels}
        values: dictionary containing sample names as the keys and signals as the values
        positions: genomic positions sorted by signals displayed in the plot
        labels: list of clustering labels if kmean clustering was performed
        
    Raises
    ------
    Notes
    -----
    References
    ----------
    See Also
    --------
    Examples
    --------
    """
    if len(order)==0:
        order=list(files.keys())
    for k, v in files.items():
        assert type(v)==str, "filenames must be str, but {} was give.".format(type(v))
        if not os.path.isfile(v)==True:
            raise Exception("{} does not exist.".format(v))
            
    
    
    bw=pwg.open(files[order[0]])
    chrom_sizes=bw.chroms()
    
    if type(bed)==str:
        
        pos=[]
        bedchroms=set()
        with open(bed) as fin:
            for l in fin:
                if l.startswith("#"):
                    continue
                l=l.split()
                chrom, s, e=l[0],l[1],l[2]
                
                if not chrom in chrom_sizes:
                    raise Exception("There may be mismatches in chromosome names between the bed file and bigwig files.\n\
                the chromosome names in the bed file are: {}\n\
                those in the bigwig files are: {}".format(chrom, chrom_sizes.keys())) 
                
                s, e=int(s.replace(",","")),int(e.replace(",",""))
                center=(s+e)//2
                cs=center-extend
                ce=center+extend
                if cs <0 or chrom_sizes[chrom] < ce:
                    continue
                pos.append([chrom,cs,ce])
                bedchroms.add(chrom)
    else:
        pos=bed
    
    
    data={}
    data_mean=[]
    
    for i, sample in enumerate(order):
        bigwig=files[sample]
        data[sample]=[]
        vals, mean=zip(*Parallel(n_jobs=n_jobs, prefer="threads")(delayed(_read_and_reshape_bw)(chrom, s, e, bigwig, binsize) for chrom, s, e in pos))
        
        
        data[sample]=np.nan_to_num(vals)
        if i==0:
            data_mean=mean
        # for chrom, s, e in pos:
        #
        #     val=bw.values(chrom, s, e)
        #     #print(val)
        #     val=np.array(val)
        #     #print(val.shape)
        #     val=val.reshape([-1,binsize]).mean(axis=1)
        #     #print(val.shape)
        #     data[sample].append(val)
        #     if i==0:
        #         data_mean.append(bw.stats(chrom, s, e, exact=True)[0])
    labels=[]
    if clustering=="kmeans_all":
        mat=[]
        for sample in order:
            #print(np.array(data[sample]).shape)
            mat.append(data[sample])
        mat=np.concatenate(mat, axis=1)
        #print(mat.shape)
        mat=zscore(mat, axis=0)
        pca=PCA(n_components=5, random_state=1)
        xpca=pca.fit_transform(mat)
        kmean = KMeans(n_clusters=n_clusters, random_state=0,n_init=10)
        kmX=kmean.fit(xpca)
        labels=kmX.labels_
    elif clustering=="kmeans":
        mat=np.array(data[order[0]])
        kmean = KMeans(n_clusters=n_clusters, random_state=0,n_init=10)
        kmX=kmean.fit(mat)
        labels=kmX.labels_
    elif clustering=="kmeans_auto":
        mat=np.array(data[order[0]])
        optimalclusternum=_optimal_kmeans(mat, [2, 10])
        n_clusters=np.amax(optimalclusternum)
        kmean = KMeans(n_clusters=n_clusters, random_state=0,n_init=10)
        kmX=kmean.fit(mat)
        labels=kmX.labels_
        
    sortindex=np.argsort(data_mean)[::-1]
    pos=np.array(pos)[sortindex]

    plotindex=np.arange(0, len(pos), len(pos)//1000)
    fig, axes=plt.subplots(ncols=len(order), figsize=[2*len(order), 6])
    if len(order)==1:
        axes=[axes]
    for i, (sample, ax) in enumerate(zip(order, axes)):
        vals=data[sample]

        vals=np.array(vals)

        vals=vals[sortindex]
        if clustering!="":
            
            _labels=np.array(labels)[sortindex]
            sortindex2=np.argsort(_labels)
            vals=vals[sortindex2]
            _pos=pos[sortindex2]
            _labels=_labels[sortindex2]
            _labels=_labels[plotindex]
            ulabel, clabel=np.unique(_labels, return_counts=True)

        vals=vals[plotindex]
        if np.sum(vals)<=10**-6:
            raise Exception("Signals may be empty, or you may be using a wrong bed file.")
        if np.amax(vals) <= 1  and np.amin(vals)>=0:
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none",norm=LogNorm(vmin=0.00001))
        elif np.amin(vals) < 0:
            vals=np.where(vals < 0, 0, vals)
            warnings.warn("The bigwig contains negative values. Is this coverage file?")
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none",norm=LogNorm(vmin=1))
        elif minval!=None:
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none",norm=LogNorm(vmin=minval))
        else:
            im=ax.imshow(vals,aspect="auto", cmap=palette, interpolation="none",norm=LogNorm(vmin=1))
        
        
        
        cbaxes = fig.add_axes([0.8*(i+1)/len(order), 0.93, 0.03, 0.06]) 
        cb = plt.colorbar(im, cax = cbaxes)
        
            
        # print(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4))
        # print(np.arange(-extend, extend+extend//2, extend//2))
        ax.set_xticks(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4),labels=np.arange(-extend, extend+extend//2, extend//2))
        ax.set_xlabel("Peak range [bp]")
        ax.set_title(sample)
        
        if clustering!="":
            total=0
            isep=0
            for label, count in zip(ulabel, clabel):
                
                total+=count
                if isep==ulabel.shape[0]-1:
                    break
                ax.plot([0, vals.shape[1]-1], [total,total], color="black")
                isep+=1
        ax.set_yticks([])
        ax.grid(False)
    _save(save, "heatmap")
    fig, axes2=plt.subplots(ncols=len(order), figsize=[4*len(order), 3])
    if len(order)==1:
        axes2=[axes2]
    maxval=0
    for i, (sample, ax) in enumerate(zip(order, axes2)):
        vals=data[sample]
        vals=np.array(vals)
        
        labels=np.array(labels)
        if clustering !="":
            for label, count in zip(ulabel, clabel):
                
                _vals=np.mean(vals[labels==label],axis=0)
                #print(_vals.shape)
                #print(vals.shape)
                ax.plot(np.arange(-extend, extend, 2*extend//(_vals.shape[0])),_vals, label=label)
                _maxval=np.max(_vals)
                if maxval < _maxval:
                    maxval=_maxval
            plt.legend()
        else:
            vals=vals.mean(axis=0)
            ax.plot(np.arange(-extend, extend, 2*extend//(vals.shape[0])),vals)
            _maxval=np.max(vals)
            if maxval < _maxval:
                maxval=_maxval 
        if i==0:
            ax.set_ylabel("Mean signal value")
        # print(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4))
        # print(np.arange(-extend, extend+extend//2, extend//2))
        #ax.set_xticks(np.arange(0, vals.shape[1]+vals.shape[1]//4, vals.shape[1]//4),labels=np.arange(-extend, extend+extend//2, extend//2))
        ax.set_xlabel("Peak range [bp]")
        ax.set_title(sample)
    
    for ax in axes2:
        ax.set_ylim(0, maxval*1.05)
    
    _save(save, "average")
    
    return {"values":data,"potisons":_pos,"labels":labels, "axes":axes,"axes2":axes2}

def plot_bed_correlation(files: dict, method: str = 'pearson', palette: str = 'coolwarm', figsize: list = [6, 6], show_val: bool = True, clustermap_param: dict = {}, step: int = 100, show: bool = False)

Expand source code

def plot_bed_correlation(files:dict,
                         method: str="pearson",
                         palette: str="coolwarm",
                         figsize: list=[6,6], 
                         show_val: bool=True, 
                         clustermap_param: dict={},
                         step: int=100,
                         show: bool=False):
    for k, v in files.items():
        assert type(v)==str, "filenames must be str, but {} was give.".format(type(v))
        if not os.path.isfile(v)==True:
            raise Exception("{} does not exist.".format(v))
    bedintervals={}
    minmax={}
    for sample, bedfile in files.items():
        bedintervals[sample]={}
        with open(bedfile) as fin:
            for l in fin:
                if l.startswith("#"):
                    continue
                l=l.split()
                chrom=l[0]
                s, e=int(l[1]), int(l[2])
                if not chrom in bedintervals[sample]:
                    bedintervals[sample][chrom]=[]
                if not chrom in minmax:
                    minmax[chrom]=[]
                bedintervals[sample][chrom].append([s,e])
                minmax[chrom].append(s)
                minmax[chrom].append(e)
    minmaxval={}
    for chrom, se in minmax.items():
        minmaxval[chrom]=[np.min(se), np.max(se)]
    
    mat=[]
    samples=[]
    for sample, chrom_intervals in bedintervals.items():
        print(sample)
        tmp=[]
        for chrom, intervals in chrom_intervals.items():
            tmp2=[0 for _ in range(minmaxval[chrom][0], minmaxval[chrom][1],step)]
            for s, e in intervals:
                s=s-minmaxval[chrom][0]
                e=e-minmaxval[chrom][0]
                for i in range(s, e,step):
                    tmp2[i//step]=1
            tmp.extend(tmp2)
        mat.append(tmp)
        samples.append(sample)
    mat=np.array(mat)
    mat = mat[:, np.any(mat, axis=0)]
    print(mat[:100,:100])
    if method=="pearson":
        #dmat=Parallel(n_jobs=-1)(delayed(_calc_pearson)(ind, mat) for ind in list(it.combinations(range(mat.shape[0]), 2)))
        dmat=np.corrcoef(mat)
        dmat=np.array(dmat)
        #dmat=squareform(dmat)
        print(dmat)
        #dmat+=np.identity(dmat.shape[0])
    else:
        dmat=squareform(pdist(mat, method))
    
            
    
    g=sns.clustermap(data=dmat,xticklabels=samples,yticklabels=samples,
               method="ward", cmap=palette,
               col_cluster=True,
               row_cluster=True,
               figsize=figsize,
               rasterized=True,
               #cbar_kws={"label":"Pearson correlation"}, 
               annot=show_val,
               **clustermap_param)
    if method=="pearson":
        title="Pearson correlation"
    else:
        title=method+" distance"
    g.cax.set_ylabel(title, rotation=-90,va="bottom")
    plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)  # For y axis
    plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90) # For x axis
    if show:
        plt.show()
        
    return {"correlation": dmat,"samples":samples}

def plot_bigwig(files: dict, bed: Union[str, list], gff: str, step: int = 100, stack_regions: str = 'horizontal', highlight: Union[str, list] = [], highlightname='', n_jobs=-1, save: str = '') ‑> dict

Plotting bigwig files in specified genomic regions in the bed file.

Parameters

files : dict: A dictionary whose keys are sample names and values are file names
bed : str: A bed file name containing the list of genomic regions
gff : str: A gff3 file with a corresponding genome version
step : int: A bin size to reduce the bigwig signal resolution.
show : bool: Whether or not to show the figure.

Returns

{"ax":axes,"values":mat,"genes":geneset,"positions":pos} : dict A dictionary containing matplot axes, signal values, geneset, and positions. Raises

Notes

References

Examples

Expand source code

def plot_bigwig(files: dict, 
                bed: Union[str, list], 
                gff: str,
                step: int=100,
                stack_regions: str="horizontal", 
                highlight: Union[str, list]=[],
                highlightname="",
                n_jobs=-1,
                save: str="")->dict:
    """
    Plotting bigwig files in specified genomic regions in the bed file.  
    
    Parameters
    ----------
    files : dict
        A dictionary whose keys are sample names and values are file names 
    bed : str
        A bed file name containing the list of genomic regions
    gff : str
        A gff3 file with a corresponding genome version
    step: int
        A bin size to reduce the bigwig signal resolution.
    show : bool
        Whether or not to show the figure.
    Returns
    -------
    {"ax":axes,"values":mat,"genes":geneset,"positions":pos} : dict
        A dictionary containing matplot axes, signal values, geneset, and positions.
    Raises
    ------
    Notes
    -----
    References
    ----------
    See Also
    --------
    Examples
    --------
    """ 
    for k, v in files.items():
        assert type(v)==str, "filenames must be str, but {} was give.".format(type(v))
        if not os.path.isfile(v)==True:
            raise Exception("{} does not exist.".format(v))
    if type(bed)==str:
        posrange=[]
        pos=[]
        with open(bed) as fin:
            
            for l in fin:
                l=l.split()
                chrom, s, e=l[0],l[1],l[2]
                posrange.append(pr.from_dict({"Chromosome": [chrom], "Start": [int(s.replace(",",""))], "End": [int(e.replace(",",""))]}))
                pos.append([chrom,int(s.replace(",","")),int(e.replace(",",""))])  
    else:
        pos=bed
        
    if type(highlight)==str:
        _highlight=[]
        with open(bed) as fin:
            for l in fin:
                l=l.split()
                chrom, s, e=l[0],l[1],l[2]
                _highlight.append([chrom,int(s.replace(",","")),int(e.replace(",",""))])
        highlight=_highlight
    
    time_start=time.time()
    if gff.endswith("gff") or gff.endswith("gff3"):
        gff_dict=_readgff2(gff, "gene", others=["gene_name", "Strand"])
    elif gff.endswith("gtf"):
        gff_dict=_readgtf2(gff, "transcript", others=["gene_name", "Strand"])
    else:
        raise Exception("The gff/gtf file name is required to have either gff, gff3, or gtf extension.")
    gff_ob=pr.from_dict(gff_dict)
    
    #
    # if gff.endswith("gff") or gff.endswith("gff3"):
    #     gff_ob=pr.read_gff3(gff)
    #     gff_ob=gff_ob[gff_ob.Feature=="gene"]
    #     gffformat="gff"
    # elif gff.endswith("gtf"):
    #     gff_ob=pr.read_gtf(gff)
    #     gff_ob=gff_ob[gff_ob.Feature=="transcript"]
    #     gffformat="gtf"
    print("Reading the gene annotation file took", time.time()-time_start)
    # geneset=[]
    # for chrom, start, end in pos:
    #     geneset.append(gff_ob.get_genes(chrom, start, end))
    time_start=time.time()
    geneset=[]
    _grs=Parallel(n_jobs=n_jobs)(delayed(gff_ob.overlap)(gr) for gr in posrange)
    # for gr in posrange:
    #     gr2=gff_ob.intersect(gr)
    for gr2 in _grs:
        #gr2=gff_ob.intersect(gr)
        if len(gr2)==0:
            geneset.append([])
        else:
            #print(gr2)
            # if gffformat=="gff":
            #     gr2=gr2[gr2.Feature=="gene"]
            # elif gffformat=="gtf":
            #     gr2=gr2[gr2.Feature=="transcript"]
                
            tmp=[]
            for gene_name, start, end, ori in zip(gr2.gene_name, gr2.Start, gr2.End, gr2.Strand):
                tmp.append([gene_name, start, end, ori])
            geneset.append(tmp)
    #print(geneset)
    #geneset=Parallel(n_jobs=-1)(delayed(gff_ob.get_genes)(chrom, start, end) for chrom, start, end in pos)
    print("Obtaining genes took", time.time()-time_start)
            
    time_start=time.time()
    mat={}
    for samplename, f in files.items():
        bw=pwg.open(f)
        #h, t=os.path.split(f)
        #samplename=t.replace(prefix, "").replace(suffix_set, "")
        mat[samplename]=[]
        
        for chrom, s, e in pos:
            val=bw.values(chrom, s, e)
            
            mat[samplename].append(val)
    
    
    
    print("Reading bigwig files took", time.time()-time_start)
    
    if stack_regions=="vertical":
        fig, axes=plt.subplots(nrows=len(pos)*2,ncols=len(files),figsize=[10,10],gridspec_kw={
                           'height_ratios': [2,1]*len(pos),"hspace":0},
                       constrained_layout = True)
        if len(axes.shape)==1:
            axes=axes.reshape([-1,1])
        sample_order=sorted(mat.keys())
        ticks_labels=[]
        for si, sample in enumerate(sample_order):
            
            vals=mat[sample]
            ymax=0
            for posi, (genes, val) in enumerate(zip(geneset, vals)):
                if len(val)==0:
                    continue
                ax=axes[posi*2,si]
                chrom, s, e=pos[posi]
                val=np.array(val)
                vallen=val.shape[0]
                remains=vallen%step
                _e=e-remains
                x=np.arange(s, _e,step)
                #print(s, e, _e, vallen, step)
                val=val[:vallen-remains].reshape(-1, step).mean(axis=1)
                _ymax=np.amax(val)
                if _ymax >  ymax:
                    ymax =_ymax
                if len(highlight)!=0:
                    hchrom, hs, he=highlight[posi]
                    ax.fill_between([hs, he], [_ymax, _ymax], alpha=0.5, color="r")
                ax.fill_between(x, val,edgecolor='face')
                
                
                if si==0:
                    ax.set_title(sample+" "+chrom+":"+str(s)+"-"+str(e))
                    ticks_labels.append([[np.amin(x),np.amax(x)],ax.get_xticks(), ax.get_xticklabels()])
                ax.set_xticks([],labels=[])
            occupied=[[] for i in range(8)]
        
        

        
        for posi in range(len(pos)):
            interval=pos[posi][2]-pos[posi][1]
            axes[posi*2+1,si].plot(ticks_labels[posi][0],[0,0],alpha=0)
            for genename, gs, ge,ori in geneset[posi]:
                slot=0
                for oi in range(len(occupied)):
                    oc=False
                    if len(occupied[oi])==0:
                        occupied[oi].append([gs, ge])
                        oc=False
                        slot=oi
                    else:
                        for tmps, tmpe in occupied[oi]:
                            if tmps<=gs<= tmpe or tmps<=ge<=tmpe:
                                oc=True
                        if oc==False:
                            occupied[oi].append([gs, ge])
                            slot=oi
                    if oc==False:
                        break
                if gs < pos[posi][1]:
                    gs=pos[posi][1]
                if ge > pos[posi][2]:
                    ge=pos[posi][2]
                axes[posi*2+1,si].plot([gs, ge], [slot*1,slot*1], color="gray")
                if ori=="+":
                    axes[posi*2+1,si].plot([ge-interval/32, ge], [slot*1+1,slot*1], color="gray")
                elif ori=="-":
                    axes[posi*2+1,si].plot([gs, gs+interval/32], [slot*1,slot*1+1], color="gray")
                
                axes[posi*2+1,si].text((gs+ge)/2, slot*1, genename, ha="center")
            for oi, ol in enumerate(occupied):
                if len(ol)==0:
                    maxslot=oi
                    break
            axes[posi*2+1,si].set_ylim(-0.5,maxslot+0.5)
            #axes[posi*2+1,si].ticklabel_format(useOffset=False)
            axes[posi*2+1,si].set_yticks([],labels=[])
            axes[posi*2+1,si].set_xticks(ticks_labels[posi][1])
            axes[posi*2+1,si].set_ylabel("Genes")
        if save !="":
            if save.endswith(".pdf") or save.endswith(".png") or save.endswith(".svg"):
                h, t=os.path.splitext(save)
                plt.savefig(h+"_plot"+t)
            else:
                plt.savefig(save+"_plot.pdf")
    else:
        fig, axes=plt.subplots(nrows=len(files)+1,ncols=len(pos),gridspec_kw={
                           'height_ratios': [2]*len(files)+[1]})
        
        if len(axes.shape)==1:
            axes=axes.reshape([-1,1])
        sample_order=sorted(mat.keys())
        for si, sample in enumerate(sample_order):
            vals=mat[sample]
            ymax=0
            #print(ymax)
            for posi, (genes, val) in enumerate(zip(geneset, vals)):
                if len(val)==0:
                    continue
                ax=axes[si,posi]
                chrom, s, e=pos[posi]
                val=np.array(val)
                vallen=val.shape[0]
                remains=vallen%step
                _e=e-remains
                x=np.arange(s, _e,step)
                
                #print(s, e, _e, vallen, step)
                val=val[:vallen-remains].reshape(-1, step).mean(axis=1)
                _ymax=np.amax(val)
                if _ymax >  ymax:
                    ymax =_ymax

                ax.fill_between(x, val,edgecolor='face')

                if posi==0:
                    ax.set_ylabel(sample)
    
                ax.set_xticks([],labels=[])
                if si==0:
                    ax.set_title(chrom+":"+str(s)+"-"+str(e))
                ax.grid(False)
                    
            occupied=[[] for i in range(8)]
            for posi in range(len(pos)):
                axes[si,posi].set_ylim(0,ymax*1.01)
                if posi >0:
                    axes[si,posi].set_yticks([],labels=[])
        
        for posi in range(len(pos)):
            interval=pos[posi][2]-pos[posi][1]
            for genename, gs, ge,ori in geneset[posi]:
                slot=0
                for oi in range(len(occupied)):
                    oc=False
                    if len(occupied[oi])==0:
                        occupied[oi].append([gs, ge])
                        oc=False
                        slot=oi
                    else:
                        for tmps, tmpe in occupied[oi]:
                            if tmps<=gs<= tmpe or tmps<=ge<=tmpe:
                                oc=True
                        if oc==False:
                            occupied[oi].append([gs, ge])
                            slot=oi
                    if oc==False:
                        break
                if gs < pos[posi][1]:
                    gs=pos[posi][1]
                if ge > pos[posi][2]:
                    ge=pos[posi][2]
                axes[si+1,posi].plot([gs, ge], [slot*1,slot*1], color="gray")
                if ori=="+":
                    axes[si+1,posi].plot([ge-interval/32, ge], [slot*1+0.1,slot*1], color="gray")
                elif ori=="-":
                    axes[si+1,posi].plot([gs, gs+interval/32], [slot*1,slot*1+0.1], color="gray")
                
                axes[si+1,posi].text((gs+ge)/2, slot*1, genename, ha="center")
                #axes[si+1,posi].grid(False)
            for oi, ol in enumerate(occupied):
                if len(ol)==0:
                    maxslot=oi
                    break
            axes[si+1,posi].set_ylim(-0.5,maxslot+0.5)
            axes[si+1,posi].ticklabel_format(useOffset=False)
            axes[si+1,posi].set_yticks([],labels=[])
            
        
        plt.tight_layout(h_pad=-1,w_pad=-1)
        _save(save, "plot")
        
    return {"ax":axes,"values":mat,"genes":geneset,"positions":pos}

def plot_bigwig_correlation(files: dict, chrom: str = 'chr1', step: int = 1000, palette: str = 'coolwarm', figsize: list = [6, 6], show_val: bool = True, clustermap_param: dict = {}, peakfile: str = '', show: bool = False, n_jobs: int = -1, save: str = '') ‑> Dict[~KT, ~VT]

Calculate pearson correlations and draw a heatmap for bigwig files.

Parameters

files : dict: A dictionary whose keys are sample names and values are bigwig file names
chrom : str, optional: A chromsome to cal
step : int, optional: A bin size to reduce the bigwig signal resolution.
palette : str, optional
show_val : bool, optional: Whether or not to show correlation values on the heatmap
peakfile : str, optional: A peak file/bed file containing genome regions of interest. If given this option, the correlation will be calculated using the genome regions listed in this file.
show : bool, optional: Whether or not to show the figure.
n_jobs : int, optional: The number of threads to use. It will use all available threads by default.

Returns

{"ax":axes,"values":mat,"genes":geneset,"positions":pos} : dict A dictionary containing matplot axes, signal values, geneset, and positions. Raises

Notes

References

Examples

Expand source code

def plot_bigwig_correlation(files: dict, 
                            chrom: str="chr1",
                            step: int=1000,
                            palette: str="coolwarm",
                            figsize: list=[6,6],
                            show_val: bool=True,
                            clustermap_param: dict={},
                            peakfile: str="",
                            #show_val: bool=False,
                            show: bool=False,
                            n_jobs: int=-1, save: str="") -> Dict:
    
    """
    Calculate pearson correlations and draw a heatmap for bigwig files.  
    
    Parameters
    ----------
    files : dict
        A dictionary whose keys are sample names and values are bigwig file names 
    chrom : str, optional
        A chromsome to cal
    step: int, optional
        A bin size to reduce the bigwig signal resolution.
    palette: str, optional
    show_val: bool, optional
        Whether or not to show correlation values on the heatmap
    peakfile: str, optional
        A peak file/bed file containing genome regions of interest. If given this option, the correlation will be calculated using the genome regions listed in this file.
    show : bool, optional
        Whether or not to show the figure.
    n_jobs : int, optional
        The number of threads to use. It will use all available threads by default. 
    Returns
    -------
    {"ax":axes,"values":mat,"genes":geneset,"positions":pos} : dict
        A dictionary containing matplot axes, signal values, geneset, and positions.
    Raises
    ------
    Notes
    -----
    References
    ----------
    See Also
    --------
    Examples
    --------
    """
    for k, v in files.items():
        assert type(v)==str, "filenames must be str, but {} was give.".format(type(v))
        if not os.path.isfile(v)==True:
            raise Exception("{} does not exist.".format(v))
    
    mat=[]
    samples=[]
    bw=pwg.open(list(files.values())[0])
    chrom_sizes=bw.chroms()
    
    
        
    
    if peakfile!="":
        # peaks=read_peaks(peakfile)
        # for k, f in files.items():
        #     bw=pwg.open(f)
        #     tmp=[]
        #     for chrom, se in peaks.items():
        #         for s, e in se:
        #             val=bw.stats(chrom,s,e, exact=True)[0]
        #             if val==None:
        #                 val=0
        #         tmp.append(val)
        #     mat.append(tmp)
        #     samples.append(k)
        peaks=_read_peaks(peakfile)
        for k, f in files.items():
            bw=pwg.open(f)
            
            tmp=Parallel(n_jobs=n_jobs)(delayed(_read_bw_stats)(chrom, s, e, f) for chrom, s, e in peaks)
            mat.append(tmp)
            samples.append(k)
        
    elif chrom !="all":
        chrom_size=chrom_sizes[chrom]
        
        samples=[]
        mat=[]
        #samples=list(files.keys())
        #mat=Parallel(n_jobs=n_jobs)(delayed(read_bw)(files[s],chrom, chrom_size, step) for s in samples)
        for k, f in files.items():
            bw=pwg.open(f)
            val=np.array(bw.values(chrom,0,chrom_size))
            rem=chrom_size%step
            val=val[:val.shape[0]-rem]
            print(val.shape)
            val=np.nan_to_num(val)
            val=val.reshape([-1,step]).mean(axis=1)
            print(val.shape)
            mat.append(val/np.sum(val))
            samples.append(k)
    elif chrom=="all":
        chroms=chrom_sizes.keys()
        for k, f in files.items():
            
            #tmp=Parallel(n_jobs=n_jobs)(delayed(read_bw)(f, chrom, 0, chrom_sizes[chrom], f) for chrom, s, e in chroms)
            bw=pwg.open(f)
            tmp=[]
            for chrom in chroms:
                chrom_size=chrom_sizes[chrom]
                val=np.array(bw.values(chrom,0,chrom_size))
                rem=chrom_size%step
                val=val[:val.shape[0]-rem]
                val=np.nan_to_num(val)
                val=val.reshape([-1,step]).mean(axis=1)
                tmp.append(val/np.sum(val))
            
            samples.append(k)
            mat.append(np.concatenate(tmp))
    else:
        raise Exception("please specify chrom or peakfile options.")
    
    mat=np.array(mat)
    dmat=Parallel(n_jobs=-1)(delayed(_calc_pearson)(ind, mat) for ind in list(it.combinations(range(mat.shape[0]), 2)))
    dmat=np.array(dmat)
    get_reusable_executor().shutdown(wait=True)
    dmat=squareform(dmat)
    print(dmat)
    dmat+=np.identity(dmat.shape[0])
    g=sns.clustermap(data=dmat,xticklabels=samples,yticklabels=samples,
               method="ward", cmap=palette,
               col_cluster=True,
               row_cluster=True,
               figsize=figsize,
               rasterized=True,
               #cbar_kws={"label":"Pearson correlation"}, 
               annot=show_val,
               **clustermap_param)
    g.cax.set_ylabel("Pearson correlation", rotation=-90,va="bottom")
    plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)  # For y axis
    plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90) # For x axis
    
    
    _save(save, "correlation")
    if show:
        plt.show()
    return {"correlation": dmat,"samples":samples}