Adventures in Python: How to Bootstrap for Parameter Confidence Intervals

We can modify the procedure for fitting data from here : Fitting a Line

import numpy
import matplotlib.pyplot as plt
from scipy.optimize import fmin
 
#==============================================
## Generating data to fit
n = 20
xmin = 1
xmax = 11
a=2
b=4
funct=lambda x,a,b:a*x+b
x = linspace(xmin,xmax,4)
y = funct(x,a,b)
#===============================================
 
 
 
#scores fit of functions for a set of parameters
def score((parms)):
    #1 pull off parameters from optimizer input
    A,B=parms
    #2 data you are matching to
    funct=lambda x,a,b:a*x+b
    x = linspace(1,11,20)
    y = funct(x,2,4)
    #3 write function to fit data
    my_guess_function=lambda x,A,B:A*x+B
    #4 write cost function to see how well it fits
    ss=lambda x,y,A,B:((my_guess_function(x,A,B)-y)**2).sum()
    #5 return the fit score for this try
    return ss(x,y,A,B)
 
 
#find the parameters that give us the lowest score
answ=fmin(score,(3,1),full_output=1,maxiter=1000)
 
#Create function from lowest scoring parameters
function=lambda x,A,B:A*x+B
A=answ[0][0]
B=answ[0][1]
X1 = linspace(xmin,xmax,n*5)
Y1= funct(X1,A,B)
 
#===============================================
#plotting the data and lowest scoring (aka best)fit
plt.plot(x,y,'ro',X1,Y1,'-')
plt.legend(('data', 'the lowest scoring fit'),'upper center')
graphname='optimized fit'
plt.title(graphname)
plt.xlabel('time')
plt.show()
#===============================================

1) Figure out what the distribution of your data is at each timepoint:

    Lets say we know that each of those red dots has a normal distribution and
    we have the standard deviation for each sample

    Python lets you create distributions and randomly sample from them
    using the random module.

    Here is an example: let's say we have a mean of 5 and a standard deviation of 2
    for a timepoint and we know our population is normally distributed.

    Using this knowledge, we can resample from the hypothetical population 1000
    times... see below.

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import random

dist=[]
for i in range(1,1000):
    v=random.normalvariate(5.0, 2.0)
    print v
    dist.append(v)


x=np.array(dist)


#Create Histogram of Sampled Data
#==============================================================
fig = plt.figure()
ax = fig.add_subplot(111)
 
 
binnum=20 #number of bins you want to sort the data into
n, bins, patches = ax.hist(x,binnum, normed=1, facecolor='purple', alpha=0.75)
 
ax.set_xlabel('x values')
ax.set_ylabel('Probability')
ax.set_title('Histogram')
ax.set_xlim(0,11)
ax.set_ylim(0,1)
ax.grid(True)
 
plt.show()
#==========================================================================

This creates a resampled population normally distributed around 5.

2) Create Pseudo-Data Sets from Distributions

Now that we know what the distribution looks like for each point we can
randomly resample each data point and create new pseudodata sets that
could represent the larger population.
I have created 9 such pseudo-data sets for the line fit using the code below:

import numpy
import matplotlib.pyplot as plt
from scipy.optimize import fmin

#==============================================
## Generating data to fit
n = 20
xmin = 1
xmax = 11
a=2
b=4
funct=lambda x,a,b:a*x+b
x = linspace(xmin,xmax,4)
y = funct(x,a,b)
#===============================================

#create resampled data sets
#====================================================================
#create vector that will hold shape parameters for each distribution
#(in this case our shape parameter is the standard deviation)
sp=[2,4,5,3]

rs=[] #resample matrix
for i in range(1,10):
    test=[random.normalvariate(y[i],sp[i]) for i in range(0,len(y))]
    rs.append(test)
    
for i in rs:
    print i
#=====================================================================

>>> 
[4.7843939442434049, 16.335795443248493, 23.923415020827772, 28.926647319876341]
[1.4550122401530485, 11.938249400807138, 18.58514566588746, 28.753592173175928]
[8.4116306190572594, 13.739921412912709, 15.298217154055491, 25.072461269093058]
[10.804490320068146, 11.70660584443795, 9.3519437241967207, 22.426218200316015]
[8.1523361887669168, 15.342873727826307, 17.955868012168718, 27.418223311717338]
[4.6100643262417726, 14.725450585717125, 12.801632690166691, 27.866769533245787]
[8.1626813397096498, 11.975600098300783, 21.663261724594363, 28.361739176867761]
[7.774551346614186, 11.701822045751257, 17.199540728133663, 26.062178459008841]
[4.7840722792105623, 18.40165646924935, 25.292649340312746, 20.674055879381079]
>>>

3) Fit Pseudo-Data

import numpy
import matplotlib.pyplot as plt
from scipy.optimize import fmin

## Generating data to fit
#===================================================================

n = 20
xmin = 1
xmax = 11
a=2
b=4
funct=lambda x,a,b:a*x+b
x = linspace(xmin,xmax,4)
y = funct(x,a,b)



##Create resampled data sets
#====================================================================
#create vector that will hold shape parameters for each distribution
#(in this case our shape parameter is the standard deviation)
sp=[2,4,5,3]

rs=[] #resample matrix
for i in range(1,10):
    test=[random.normalvariate(y[i],sp[i]) for i in range(0,len(y))]
    rs.append(test)


## Scoring Function
#=====================================================================
def score((parms)):
        #1 pull off parameters from optimizer input
        A,B=parms
        #2 data you are matching to
        funct=lambda x,a,b:a*x+b
        #3 write function to fit data
        my_guess_function=lambda x,A,B:A*x+B
        #4 write cost function to see how well it fits
        ss=lambda x,y,A,B:((my_guess_function(x,A,B)-y)**2).sum()
        #5 return the fit score for this try
        return ss(x,y,A,B)

    
## Fit Pseudo Data
#====================================================================
#find the parameters that give us the lowest score for each set of
#pseudo-data
FitMat=[]
for ps in rs:
    y=ps
    answ=fmin(score,(3,1),full_output=1,maxiter=1000)
    print answ
  
    #Create function from lowest scoring parameters
    function=lambda x,A,B:A*x+B
    A=answ[0][0]
    B=answ[0][1]
    X1 = linspace(xmin,xmax,n*5)
    Y1= funct(X1,A,B)
    FitMat.append(Y1)

##Plot Pseudo Data and Fits
#===================================================================
#plotting the data and lowest scoring (aka best)fit
for i in range(0,len(rs)):
    plt.plot(x,rs[i],'o',X1,FitMat[i],'-')

plt.legend(('data', 'the lowest scoring fit'),'upper center')
graphname='optimized fit'
plt.title(graphname)
plt.xlabel('time')
plt.show()

4) Examine the variation in the Parameter fits on the Pseudo-data

import numpy
import matplotlib.pyplot as plt
from scipy.optimize import fmin

## Generating data to fit
#===================================================================

n = 20
xmin = 1
xmax = 11
a=2
b=4
funct=lambda x,a,b:a*x+b
x = linspace(xmin,xmax,4)
y = funct(x,a,b)



##Create resampled data sets
#====================================================================
#create vector that will hold shape parameters for each distribution
#(in this case our shape parameter is the standard deviation)
sp=[2,4,5,3]

rs=[] #resample matrix
for i in range(1,10):
    test=[random.normalvariate(y[i],sp[i]) for i in range(0,len(y))]
    rs.append(test)


## Scoring Function
#=====================================================================
def score((parms)):
        #1 pull off parameters from optimizer input
        A,B=parms
        #2 data you are matching to
        funct=lambda x,a,b:a*x+b
        #3 write function to fit data
        my_guess_function=lambda x,A,B:A*x+B
        #4 write cost function to see how well it fits
        ss=lambda x,y,A,B:((my_guess_function(x,A,B)-y)**2).sum()
        #5 return the fit score for this try
        return ss(x,y,A,B)

    
## Fit Pseudo Data
#====================================================================
#find the parameters that give us the lowest score for each set of
#pseudo-data
FitMat=[]
parmA=[]
parmB=[]
for ps in rs:
    y=ps
    answ=fmin(score,(3,1),full_output=1,maxiter=1000)
    print answ
  
    #Create function from lowest scoring parameters
    function=lambda x,A,B:A*x+B
    A=answ[0][0]
    B=answ[0][1]
    X1 = linspace(xmin,xmax,n*5)
    Y1= funct(X1,A,B)
    FitMat.append(Y1)
    parmA.append(A)
    parmB.append(B)

##Figure out Stats on Parm
#==================================================================
print "slope is on average " + str(np.average(parmA))
print "with a standard deviation of " +str(np.std(parmA))


'''
##Plot Pseudo Data and Fits
#===================================================================
#plotting the data and lowest scoring (aka best)fit
for i in range(0,len(rs)):
    plt.plot(x,rs[i],'o',X1,FitMat[i],'-')

plt.legend(('data', 'the lowest scoring fit'),'upper center')
graphname='optimized fit'
plt.title(graphname)
plt.xlabel('time')
plt.show()
'''
>>>
slope is on average 1.92806743556
with a standard deviation of 0.269858064123
>>>

Adventures in Python

Blog Archive

Pages

Thursday, November 29, 2012

How to Bootstrap for Parameter Confidence Intervals

1) Figure out what the distribution of your data is at each timepoint:

2) Create Pseudo-Data Sets from Distributions

3) Fit Pseudo-Data

No comments:

Post a Comment