################################################################################ #This model assumes an ancestral population of size (th1+th2) that split into 2 #daughter populations # both population sizes stayed constant after the split ################################################################################ # example of ms command: # ms 50 1 -t 5 -r 20 1000 -I 2 25 25 -m 1 2 0.5 -m 2 1 0.5 -n 2 5 -eN 0.1 2 # -ej 0.1 2 1 -eM 0.1 0 # this means: # ms 50 1 : 50 samples, 1 repetition # -t 5 : theta=5 # -r 20 1000 : recombination rate is 20, with 1000 positions # -I 2 25 25 : 2 Populations, 25 samples from each # -m 1 2 0.5 : Migration from Pop 1 to Pop 2 is 0.5 (backward in time) # -m 2 1 0.5 : Migration from Pop 2 to Pop 1 is 0.5 (backward in time) # -n 2 5 : initial size of subpop 2 is 5 * initial size of subpop 1. # -eN 0.1 2 : at time 0.1 all subpop's sizes change to 2 # -ej 0.1 2 1 : at time 0.1 all lineage of pop 2 move to pop 1 print ("starting script") #size of population 1 and 2 samplesize1 <- 25 samplesize2 <- 25 anzsumstat <- 23 ##values of tau, mig and sizeratio for training and for estimation phase tbase <- (5/6) mbase <- (5/6) qbase <- (87/100) tmax <- 20 # maximum value of divergence time mmax <- 5 # maximum for migration qmax <- 10 # maximum for sizeratio (between pop2 and pop1) at present tau <- tbase^(39:0)*tmax # grid for divergence time mig <- mbase^(39:0)*mmax # grid for migration sizeratio <- qbase^(39:0)*qmax # grid for sizeratio ############################################ # compute vector of 23 summary statistics # # in: jsfs, a 2-dim-array # out: array of 23 summaryStatistics ############################################ sumstat <- function(jsfs) { n <- nrow(jsfs) m <- ncol(jsfs) c(sum(jsfs[1,2:3]), sum(jsfs[2:3,1]), sum(jsfs[1,4:(m-3)]), sum(jsfs[4:(n-3),1]), sum(jsfs[1,(m-2):(m-1)]), sum(jsfs[(n-2):(n-1),1]), sum(jsfs[2:3,2:3]), sum(jsfs[2:3,4:(m-3)]), sum(jsfs[4:(n-3),2:3]), sum(jsfs[(n-2):(n-1),4:(m-3)]), sum(jsfs[4:(n-3),(m-2):(m-1)]), sum(jsfs[2:3,(m-2):(m-1)]), sum(jsfs[(n-2):(n-1),2:3]), sum(jsfs[4:(n-3),4:(m-3)]), sum(jsfs[(n-2):(n-1),(m-2):(m-1)]), jsfs[1,m], jsfs[n,1], sum(jsfs[n,2:3]), sum(jsfs[2:3,m]), sum(jsfs[n,4:(m-3)]), sum(jsfs[4:(n-3),m]), sum(jsfs[n,(m-2):(m-1)]), sum(jsfs[(n-2):(n-1),m]) ) } ############################################################################# ### DO ONLY IF TRAINING DATA NOT AVAILABLE: ### ### generate training data (data sets for each parameter combination) ### out: saves joint site frequency spectrum (JSFS) as Feld2.rda ############################################################################# if (FALSE) { feld <- array(rep(0,40*40*40*(samplesize1+1)*(samplesize2+1)),c(40,40,40, (samplesize1+1),(samplesize2+1))) #for each of the 40*40*40 parameter combinations generate 10 datasets with #each 7 loci with ms (this should not changed) for (wdh in 1:10) { for (t in 1:40){ for(m in 1:40){ for(s in 1:40) { cat(t,m,s,"\n") system(paste("~/bin/ms/msdir/ms ",(samplesize1+samplesize2)," 7 -t 5 -r 20 1000 -I 2 ",samplesize1," ",samplesize2," -m 1 2 ",mig[m]," -m 2 1 ",mig[m]," -n 2 ",sizeratio[s]," -eN ",tau[t]," ",1+sizeratio[s]," -ej ",tau[t]," 2 1 > msoutput")) zeilen <- readLines("msoutput") # this for-loop enables to reads out lines that contain the sequences for(z in 0:6*(4+samplesize1+samplesize2)+7) { locus <- as.matrix(data.frame (strsplit(zeilen[z:(z+samplesize1+samplesize2-1)],""))) #for each locus: count number of individuals with mutations in #population 1 (x) and in population 2 (y); #"+1" needs to be added b/c index of arrays in R start with 1 for(i in 1:nrow(locus)) { x <- sum(locus[i,1:samplesize1]=="1")+1 y <- sum(locus[i,(samplesize1+1):(samplesize1+samplesize2)]=="1")+1 feld[t,m,s,x,y] <- feld[t,m,s,x,y] + 1 } } }#s }#m }#t }#wdh save(feld,file="Feld2.rda",compress=TRUE) #save the JSFS of the training data } ############################################################################## ### DO ONLY IF local regression coefficients NOT ALREADY AVAILABLE: ### ### Generate 8x8x8x23-array. Each entry is a linear model for a ### 5x5x5-block of parameter values for one of the 23 summary statistics. ### out: coefficients of linear models saved in "Modellfeld.rda" ############################################################################## #variables: u*= lower bound; o*=upper bound in the parameter block (core) #d*= gives the size of the complete window including overlapping parts # (values at the edges=8 otherwise 11) if (FALSE) { load("Feld2.rda") Modellfeld <- array(rep(-1,8*8*8*anzsumstat*6),c(8,8,8,anzsumstat,6)) for(x in 0:7) for(y in 0:7) for(z in 0:7) { cat(x,y,z,"\n") utau <- x*5+1 um1 <- y*5+1 um2 <- z*5+1 otau <- utau+4 om1 <- um1+4 om2 <- um2+4 dtau <- min(40,otau+3)-max(1,utau-3)+1 dm1 <- min(40,om1+3)-max(1,um1-3)+1 dm2 <- min(40,om2+3)-max(1,um2-3)+1 #dat has all parameter combinations of tau, m, size, and anzSumstat #"gew"= weighting the portion of window that is outside and inside core #"wert"= value at that particular index of sumstat vector #"stat"= numbers from 1 to 23 indicating which sumstat dat <- data.frame("tau"=rep(max(1,utau-3):min(40,otau+3), rep(dm1*dm2*anzsumstat,dtau)), "m1"=rep(rep(max(1,um1-3):min(40,om1+3), rep(dm2*anzsumstat,dm1)),dtau), "m2"=rep(rep(max(1,um2-3):min(40,om2+3), rep(anzsumstat,dm2)),dtau*dm1), "stat"=rep(1:anzsumstat,dtau*dm1*dm2), "wert"=rep(NA,dtau*dm1*dm2*anzsumstat), "gew"=rep(1,dtau*dm1*dm2*anzsumstat)) # weighting of parts outside core (values range from 0.125-0.5) dat$gew[dat$tauotau]<-dat$gew[dat$tauotau]/2 dat$gew[dat$m1om1] <- dat$gew[dat$m1om1]/2 dat$gew[dat$m2om2] <- dat$gew[dat$m2om2]/2 for (i in max(1,utau-3):min(40,otau+3)) for (j in max(1,um1-3):min(40,om1+3)) for (k in max(1,um2-3):min(40,om2+3)) #sumstat gets calculated for the given index combination (i,j,k) dat[dat$tau==i & dat$m1==j & dat$m2==k,"wert"] <- sumstat(feld[i,j,k,,]) for(s in 1:anzsumstat) { mod <- glm(wert~tau+m1+m2,data=dat,subset= stat==s,family=poisson, weights=gew) Modellfeld[x+1,y+1,z+1,s,] <- c(mod$coef,mod$conv, sum(dat$wert[dat$stat==s])) if (!mod$conv) cat(x,y,z,s," did not converge, sum = ", sum(dat$wert[dat$stat==s]),"\n") } } save(Modellfeld,file="Modellfeld.rda",compress=TRUE,ascii=TRUE) #Modelfeld[x,y,z,s,1:4]= coefficients of glm; [...,5]=convergence; # [...,6]=sum over sumstat==s } ############################################################################## # estimates 3 parameters and theta from vector of summary statistics # # in: # summa : vector of summary statistics as given by function sumstat # modfeld: field of local regression coefficients estimated from trainig data # method: cent (default), wcent, wbin, ibbin # out: estimates for 3 parameters and theta per locus ############################################################################## schaetze <- function(summa,modfeld,method="cent") { if(method=="wcent") { cat("Using moment estimator for each parameter\n") } else if(method=="cent") { cat("Quick guess\n") } else if(method=="wbin") { cat("Performing medium fast estimation...\n") } else cat("Careful optimization; please be patient...\n") t <- rep(1:8,64) m1 <- rep(rep(1:8,rep(8,8)),8) m2 <- rep(1:8,rep(64,8)) logL <- rep(NA,512) theta <- rep(NA,512) for (i in 1:512) { #'*5-2' to get to the midpoint of bin theta[i] <- sum(summa)/sum(exp(modfeld[t[i],m1[i],m2[i],1:anzsumstat,1:4]%*% c(1,t[i]*5-2,m1[i]*5-2,m2[i]*5-2)))*350 ## FAST METHOD if(method=="cent" || method=="wcent") { logL[i] <- 0 for (j in 1:anzsumstat) { if (modfeld[t[i],m1[i],m2[i],j,5]<0.5) { # 350 = 10 * 5 *7 = repetitions * theta* #loci # divided by 350 to calculate #mutations per repetion per loci for theta=1 lambda <- max(0.5,modfeld[t[i],m1[i],m2[i],j,6])/350*theta[i] } else { lambda <- exp(modfeld[t[i],m1[i],m2[i],j,1:4]%*% c(1,t[i]*5-2,m1[i]*5-2,m2[i]*5-2))/350*theta[i] } logL[i] <- logL[i]+summa[j]*log(lambda)-lambda } } ## MEDIUM FAST METHOD else { if(method=="wbin") { optfunk <- function(par) { # if outside the 1..8 block don't proceed further if(min(par)<0.5 || max(par)>8.5) return(1e11) logL[i] <- 0 for (j in 1:anzsumstat) { # if the glm did not converge take 0.5 (i.e. a really small value) # as value for sumstat if (modfeld[t[i],m1[i],m2[i],j,5]<0.5) { lambda <- max(0.5,modfeld[t[i],m1[i],m2[i],j,6])/350*theta[i] } else { lambda <- exp(modfeld[t[i],m1[i],m2[i],j,1:4]%*% c(1,par[1]*5-2,par[2]*5-2,par[3]*5-2))/350*theta[i] } logL[i] <- logL[i]+summa[j]*log(lambda)-lambda } -logL[i] } OOO <- optim(c(t[i],m1[i],m2[i]),optfunk,lower=c(t[i],m1[i],m2[i])-0.5, upper=c(t[i],m1[i],m2[i])+0.5,method="L-BFGS-B") t[i]<-OOO$par[1] m1[i]<-OOO$par[2] m2[i]<-OOO$par[3] logL[i]<- -OOO$value } ### SLOW METHOD: here comes the slowest, most careful method else { if(i>0 && i%%16==0) { cat((i*100)%/%512," % done\n") } optfunk <- function(par) { if(min(par)<0.5 || max(par)>8.5) return(1e11) logL[i] <- 0 tt <- round(par[1]) wt <- 1 mm1 <- round(par[2]) wm1 <- 1 mm2 <- round(par[3]) wm2 <- 1 if(par[1] - tt > 0 && par[1] < 8) { tt[2] <- tt[1]+1 wt <- 1-(par[1] - tt)^2 wt <- wt/sum(wt) } else if(par[1] - tt < 0 && par[1] > 1) { tt[2] <- tt[1]-1 wt <- 1-(par[1] - tt)^2 wt <- wt/sum(wt) } if(par[2] - mm1 > 0 && par[2] < 8) { mm1[2] <- mm1[1]+1 wm1 <- 1-(par[2] - mm1)^2 wm1 <- wm1/sum(wm1) } else if(par[2] - mm1 < 0 && par[2] > 1){ mm1[2] <- mm1[1]-1 wm1 <- 1-(par[2] - mm1)^2 wm1 <- wm1/sum(wm1) } if(par[3] - mm2 > 0 && par[3] < 8) { mm2[2] <- mm2[1]+1 wm2 <- 1-(par[3] - mm2)^2 wm2 <- wm2/sum(wm2) } else if(par[3] - mm2 < 0 && par[3] > 1){ mm2[2] <- mm2[1]-1 wm2 <- 1-(par[3] - mm2)^2 wm2 <- wm2/sum(wm2) } for (j in 1:anzsumstat) { lambda <- 0 partlambda<-array(rep(NA,length(tt)*length(mm1)*length(mm2)), c(length(tt),length(mm1),length(mm2))) for(k1 in 1:length(tt)) for(k2 in 1:length(mm1)) for(k3 in 1:length(mm2)){ if (modfeld[tt[k1],mm1[k2],mm2[k3],j,5]<0.5) { partlambda <- max(0.5,modfeld[tt[k1],mm1[k2],mm2[k3],j,6])/350*theta[i] } else { partlambda <- exp(modfeld[tt[k1],mm1[k2],mm2[k3],j,1:4]%*% c(1,par[1]*5-2,par[2]*5-2,par[3]*5-2))/350*theta[i] } lambda <- lambda+wt[k1]*wm1[k2]*wm2[k3]*partlambda } logL[i] <- logL[i]+summa[j]*log(lambda)-lambda } return(-logL[i]) } # end of optfunk OOO <- optim(c(t[i],m1[i],m2[i]),optfunk, lower=c(0.50001,0.50001,0.50001), upper=c(8.49999,8.49999,8.49999),method="L-BFGS-B") t[i]<-OOO$par[1] m1[i]<-OOO$par[2] m2[i]<-OOO$par[3] logL[i]<- -OOO$value } } } #the optimal parameters are contained in the vector x x <- (1:512)[logL==max(logL,na.rm=TRUE) & !is.na(logL)] ## WEIGHTED VOTE METHOD if(method=="wcent") { tauest <- 0 m12est <- 0 m21est <- 0 wsum <- 0 logthetaest <- 0 slL <- sum(logL) logL <- logL - max(logL,na.rm=T) for(i in 1:512) { if(!is.na(logL[i])) { wsum <- wsum + exp(logL[i]) tauest <- tauest + exp(logL[i])*t[i] m12est <- m12est + exp(logL[i])*m1[i] m21est <- m21est + exp(logL[i])*m2[i] logthetaest <- logthetaest + exp(logL[i])*log(theta[i]) } } return(list(tau=tbase^(40-(tauest/wsum*5-2))*tmax, mig=mbase^(40-(m12est/wsum*5-2))*mmax, sizeratio=qbase^(40-(m21est/wsum*5-2))*qmax, theta=exp(logthetaest/wsum)/anzLoci)) } else return(list(tau=tbase^(40-(t[x]*5-2))*tmax, mig=mbase^(40-(m1[x]*5-2))*mmax, sizeratio=qbase^(40-(m2[x]*5-2))*qmax, theta=theta[x]/anzLoci, logL[x])) #this is part of the composite likelihood } ### END of function schaetze ############################################################# # Example with 100 loci ######################################################## if(FALSE) { #samplesizes are the same as for the training data sets anzLoci <- 100 mig <- 0.5 #symmetric migration rate between both populations theta <- 13 #per locus tau <- 1.5 #divergence time between both populations q <- 3 #sizeratio between pop2 and pop1 at present system(paste("~/bin/ms/msdir/ms ",samplesize1+samplesize2," ",anzLoci," -t ",theta," -r 20 1000 -I 2 ",samplesize1," ",samplesize2," -m 1 2 ",mig," -m 2 1 ",mig," -n 2 ",q," -eN ",tau," 2 -ej ",tau," 2 1 -eM ",tau," 0 > msoutput",sep="")) zeilen <- readLines("msoutput") jsfs <- array(rep(0,(samplesize1+1)*(samplesize2+1)),c((samplesize1+1), (samplesize2+1))) for(z in 0:(anzLoci-1)*(4+samplesize1+samplesize2)+7) { locus <- as.matrix(data.frame( strsplit(zeilen[z:(z+samplesize1+samplesize2-1)],""))) for(i in 1:nrow(locus)) { x <- sum(locus[i,1:samplesize1]=="1")+1 y <- sum(locus[i,(samplesize1+1):(samplesize1+samplesize2)]=="1")+1 jsfs[x,y] <- jsfs[x,y] + 1 } } load("Modellfeld.rda") summaryStat <- sumstat(jsfs) schaetze(summaryStat,Modellfeld,method="cent") schaetze(summaryStat,Modellfeld,method="wcent") schaetze(summaryStat,Modellfeld,method="wbin") schaetze(summaryStat,Modellfeld,method="ibbin") }