Testing Pipeline Generality

Testing the fragility of the MS2 pipeline.


Download: Notebook

Written by: Ciera Martinez and Samantha Tang

8_testingStacks_ST

Testing Stacks

Purpose: We saw in Notebook 7_consolidated_ST.ipynb that our pipeline works mostly for Ciera's faint MS2 dot video's, but we want to see how robust it is for different types of MS2 videos. So we are taking some very good example data taken from Augusto.

Augusto's Data Full Data: https://www.dropbox.com/sh/umutoxcy3ebdanj/AADdV_ns2xlkxXzxli7zImvCa?dl=0&lst=

Data Example: https://www.dropbox.com/sh/umutoxcy3ebdanj/AAAO6vajYWaSfYa81C9fHG2oa/2018-01-05/EVE_D9?dl=0&lst=&subfolder_nav_tracking=1

In [2]:
import numpy as np
import pandas as pd
import czifile
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import seaborn as sns

from skimage.feature import blob_doh, blob_dog, blob_log
from skimage.color import rgb2gray #to turn into grayscale

from ipywidgets import interact #for interactions

#clustering
from sklearn.cluster import DBSCAN
from skimage import filters, measure, segmentation, feature, img_as_ubyte, morphology

from scipy import ndimage as ndi
In [3]:
# reading in images
# Example EVE_D9
array1 = czifile.imread("/Users/cierapink/Desktop/EVE_D9/EVE_D9-14.czi") 

# (time, channel, z-stack, y-axis, and x-axis), 0 is the dot's channel
array1 = array1.squeeze() #get rid of unwanted channels
array1.shape
Out[3]:
(10, 2, 21, 256, 1024)
In [4]:
#preprocessing: 
zstack = array1[9, 0, :, ...] #MS2 (dots) #FIRST TIMEPOINT
IM_MAX = np.max(zstack, axis=0) #maximum projection 
image_gray = rgb2gray(IM_MAX)
image_gray
Out[4]:
array([[ 801, 1487, 1549, ...,  817, 1461,  802],
       [ 844, 2523, 1552, ..., 1648, 1098, 2027],
       [1152, 1929,  598, ..., 1264, 3035,  819],
       ...,
       [1142, 1660, 2226, ...,  997,  915, 1071],
       [2841, 1161, 1266, ..., 1666, 1194, 1855],
       [ 797, 2255, 2011, ..., 1673, 1857,  745]], dtype=uint16)
In [5]:
#original image: 
plt.imshow(IM_MAX);

MS2 dots

In [6]:
#function to display image and create dataframe: 

def show_blobs(blobs_coords): 
    
    fig, ax = plt.subplots() #created ax just to add patches
    plt.imshow(IM_MAX)

    for idx in range(len(blobs_coords)):
        for blob in blobs_coords: 
            y, x, r = blob
            c = plt.Circle((x, y), r, color="lime", linewidth=2, fill=False)
            ax.add_patch(c)


    #make into dataframe first (no cleaning)
    x = [arr[1] for arr in blobs_coords] #note, the x and y coordinates are returned in opposite order hence the indexing
    y = [arr[0] for arr in blobs_coords]
    sd = [arr[2] for arr in blobs_coords]
    timepoint_1 = pd.DataFrame({"dots": np.arange(1, len(blobs_coords)+1), "x":x, "y":y, "sd":sd})
    
    return timepoint_1



def timepoint_df(blobs_coords, tp): 
    #make into dataframe first (no cleaning)
    x = [arr[1] for arr in blobs_coords] #note, the x and y coordinates are returned in opposite order hence the indexing
    y = [arr[0] for arr in blobs_coords]
    sd = [arr[2] for arr in blobs_coords]
    timepoint = pd.DataFrame({"dots": np.arange(1, len(blobs_coords)+1), 
                              "x":x, 
                              "y":y, 
                              "sd":sd, 
                              "timept": np.zeros(len(blobs_coords)) + tp})
    
    return timepoint

Manual Threshold 1

In [9]:
## Test Threshold
## Previous was max_sigma=30, threshold=.08

blobs_dog = blob_dog(image_gray, max_sigma=8, threshold=.04) 
show_blobs(blobs_dog)
Out[9]:
dots x y sd
0 1 712.0 255.0 4.096
1 2 888.0 220.0 1.600
2 3 264.0 214.0 1.600
3 4 472.0 196.0 1.600
4 5 237.0 113.0 1.600
5 6 713.0 82.0 1.600
6 7 512.0 65.0 1.600
7 8 999.0 15.0 1.000

Set Manual Threshold from above

In [10]:
def all_timepoints_combined(total_tp, array):
    
    df = pd.DataFrame() #initalize
    
    for i in range(total_tp): 
        zstack = array[i, 0, :, ...] #MS2 (dots) 
        IM_MAX = np.max(zstack, axis=0) #maximum projection 
        image_gray = rgb2gray(IM_MAX)
        
        blobs_dog = blob_dog(image_gray, max_sigma=8, threshold=.04) ## Change here
        timepoint = timepoint_df(blobs_dog, i)
        df = df.append(timepoint).reset_index(drop=True)
    
    return df
In [11]:
array1_timepoints = all_timepoints_combined(9, array1)
array1_timepoints.head()
Out[11]:
dots x y sd timept
0 1 731.0 255.0 1.6 0.0
1 2 867.0 251.0 1.6 0.0
2 3 248.0 250.0 1.6 0.0
3 4 692.0 247.0 1.6 0.0
4 5 895.0 246.0 1.6 0.0

Look at all those pretty stripes!

In [12]:
#sanity check
temp = array1_timepoints[["x", "y", "timept"]].groupby("timept").agg(list)

@interact(index=(0, 8))
def scatter(index=0): 
    x = temp.iloc[index,:]["x"]
    y = temp.iloc[index,:]["y"]
    plt.scatter(x, y)
    
    #formatting:
    plt.xlim([0, 1200])
    plt.xticks(np.arange(0, 1200, 200))
    plt.ylim([300, 0])
    plt.yticks(np.arange(0, 300, 100))
In [13]:
@interact(i=(0,8))
def show_images(i=5):
    zstack = array1[i, 0, :, ...] #MS2 (dots)
    IM_MAX = np.max(zstack, axis=0) #maximum projection 
    plt.imshow(IM_MAX)
In [14]:
trial1 = array1_timepoints[["x", "y", "timept"]]  
trial1.head()
Out[14]:
x y timept
0 731.0 255.0 0.0
1 867.0 251.0 0.0
2 248.0 250.0 0.0
3 692.0 247.0 0.0
4 895.0 246.0 0.0

Clustering -- Manual Threshold 3

eps = The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.

In [22]:
cluster = DBSCAN(eps=5, min_samples=2) #can specify min distance to be considered neighbors,  min # of samples in neighborhood 
lab = cluster.fit_predict(trial1) #fits clustering algorithm and returns a cluster label

lab
Out[22]:
array([ 0, -1, -1,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
       15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, -1, 29, 30,
       31, 32, 33, 34, -1, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
       47, -1, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
       63,  0, 64, 65,  2,  1,  3,  5,  4,  6,  7,  8,  9, 10, 11, 12, 14,
       16, 15, 17, 19, 20, 21, 22, 23, 66, 24, 25, 26, 27, 28, 29, 30, 31,
       32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 42, 44, 45, 47, 46, 48,
       49, 50, 51, 52, 53, 54, 55, 56, 57, 67, 68, 58, 69, 60, 61, 62, 63,
       -1,  0, 70, 65,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 11, 13,
       14, 16, 15, 17, 18, 19, 20, 22, 21, 25, 66, 23, 26, 28, 29, 30, 32,
       31, 33, 34, 35, 36, 37, 71, 38, 40, 41, 42, 43, 44, 45, -1, 47, 46,
       48, 49, 50, 72, 51, 52, 53, 55, 54, -1, 56, 57, 68, 67, 58, 69, 73,
       59, 60, 61, 63,  0, 70, 74, 65,  2,  1,  3,  4,  6,  7,  8,  9, 11,
       12, 13, 16, 14, 15, 18, 17, 19, 20, 22, 25, 66, 21, 23, 26, 28, 75,
       29, 30, 31, 32, 33, 76, -1, 34, 35, 36, 37, 71, 38, 40, 41, 42, 77,
       44, 78, 45, 47, 46, 48, 49, 50, 72, 51, 52, 53, 55, 54, 56, 57, 79,
       67, 68, 58, 73, 69, 60, 61,  0, 70, 74, 65,  1,  2,  3,  4,  6,  8,
        7, 11, 13, 12, 14, 16, 80, 18, 17, 19, 20, 21, 66, 25, 28, 75, 29,
       30, 31, 32, 33, 34, 35, 81, 36, 71, 40, 38, 41, 42, 77, 44, 78, 82,
       47, 46, 83, 48, 49, 72, 50, 51, 53, 52, 54, 84, 57, 79, 56, 68, 58,
       73, 69, 60, 61,  0, 64, 74, 65, 70,  2,  1,  3,  4,  6,  8,  9, 13,
       11, 12, 16, 14, 18, 17, 19, 20, 66, 21, 25, 85, 28, -1, 75, 29, 30,
       32, 31, 33, 86, 34, 35, 36, 71, 40, 38, 41, 42, 44, 78, 82, 47, 46,
       83, 87, 48, 49, 50, 72, 51, 53, 52, 84, 54, 57, 79, 56, 68, 67, 88,
       58, 73, 69, 60, 61,  0, 64, 74, 65, 70,  1,  2,  3,  4, 89,  8,  6,
        9, -1, 13, 11, 12, 14, 16, 80, 18, 17, 19, 20, 22, 66, 21, 25, -1,
       90, 75, 29, 30, 31, 33, 76, -1, 34, 81, 35, 36, 71, 40, 38, 41, 44,
       77, 42, 78, 82, 46, 47, 83, 48, 49, 50, 72, 52, 53, 54, 84, 57, 79,
       68, 56, 67, 88, 58, 73, 69, 60, 61,  0, 64, 70, 65,  1, 91,  3,  4,
       89,  8,  6,  9, 13, 12, 11, 14, 16, 80, 18, 17, 19, 20, 22, 66, 21,
       25, 85, 90, 75, 29, 92, 76, 86, 34, 81, 35, 36, 71, 40, 41, 44, 42,
       77, 93, 78, 46, 82, 47, 83, 87, 48, 49, 72, 52, 53, 54, 79, 57, 68,
       56, 67, 88, 58, 69, 73, 60, 61,  0, 64, 70, 74, 65, 91,  3,  4, 89,
        8,  6,  9, 13, -1, 12, 11, 16, 14, 80, 18, 17, 20, 22, 21, 66, 25,
       85, 90, -1, 75, 29, 92, 76, 86, 34, 35, 81, 36, 71, 40, 41, 44, 42,
       77, 93, 78, 82, 47, 83, 87, 48, 49, -1, 72, 54, 57, 79, 68, 56, 67,
       88, 58, 73, 60, 61])
In [21]:
plt.figure(figsize=(10, 7))  
plt.scatter(trial1["x"], trial1["y"], c=cluster.labels_);
In [23]:
trial1["DBSCAN_labels"] = lab
trial1.head()
/Users/cierapink/.local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
Out[23]:
x y timept DBSCAN_labels
0 731.0 255.0 0.0 0
1 867.0 251.0 0.0 -1
2 248.0 250.0 0.0 -1
3 692.0 247.0 0.0 1
4 895.0 246.0 0.0 2
In [24]:
#can check (notice that none of the aggregated points are from the same timepoint--so no overlaps)
# if they do, may want to play with thresholds more
trial1[["DBSCAN_labels", "timept"]].groupby("DBSCAN_labels").agg(list).head(5)
Out[24]:
timept
DBSCAN_labels
-1 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, 3.0, ...
0 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
1 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
2 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
3 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

Nucleus Centers - Manual Threshold 4

In [25]:
#using the same array we read in: 
zstack = array1[1, 1, :, ...] #(timepoint, channel, etc.) this one is timepoint 1
zstack.shape

zstack_MAX= np.max(zstack, axis=0)
plt.imshow(zstack_MAX)
plt.show()
In [26]:
# Convert to 8bit
zstack_MAX_8bit = img_as_ubyte(zstack_MAX)
plt.figure(figsize=(14, 16))
plt.imshow(zstack_MAX_8bit);
In [27]:
#### PART 1: Find centers with a conservative threshold
### First perform the conservative threshold on every point in the Z axis with the Otsu threshold
zstack_MAX_8bit_GausFilt = ndi.filters.gaussian_filter(zstack_MAX_8bit, 1) # blur image and remove noise

otsuthresh = filters.threshold_otsu(zstack_MAX_8bit_GausFilt) # grey scale 
zstack_MAX_8bit_GausFilt_localtheshold = zstack_MAX_8bit_GausFilt > otsuthresh

plt.figure(figsize=(14, 16))
plt.imshow(zstack_MAX_8bit_GausFilt_localtheshold, cmap = "Greys");
In [32]:
#### Then Call centers 
### Distance transform
distancetransform_combined_final = ndi.distance_transform_edt(zstack_MAX_8bit_GausFilt_localtheshold)

#### smoothen the distance transform
distancetransform_combined_final_gaus = ndi.filters.gaussian_filter(distancetransform_combined_final, 10)

plt.figure()
plt.imshow(distancetransform_combined_final_gaus, cmap = "Blues");

Manual Threshold 5

This will have to change depending on the From Local_max = feature.peak_local_max(distancetransform_combined_final_gaus, indices = False, min_distance = 40) to Local_max = feature.peak_local_max(distancetransform_combined_final_gaus, indices = False, min_distance = 2)

min_distance : int, optional Minimum number of pixels separating peaks in a region of 2 * min_distance + 1 (i.e. peaks are separated by at least min_distance). To find the maximum number of peaks, use min_distance=1.

In [90]:
#### Retrieve the local maxima from the distance transform
## Gives a true false for every pixel if it is the max local peak?
Local_max = feature.peak_local_max(distancetransform_combined_final_gaus, 
                                   indices = False, min_distance = 1)
Local_max_bigger = ndi.filters.maximum_filter(Local_max, size=10)

#makes a mask so that I can visualize on top of the original image
Local_max_mask = np.ma.array(Local_max_bigger, mask=Local_max_bigger==0)

#Add that mask back into the watershed image
CenterPointArrays = Local_max_bigger

Manual Threshold 6

There a bunch of thresholds and parameters in this section, but it looks like I can get close by tweaking below.

We had to change ndi.filters.gaussian_filter(zstack_MAX_8bit, sigma = 20) to ndi.filters.gaussian_filter(zstack_MAX_8bit, sigma = 5)

Sigma: sigmascalar or sequence of scalars Standard deviation for Gaussian kernel. The standard deviations of the Gaussian filter are given for each axis as a sequence, or as a single number, in which case it is equal for all axes.

In [91]:
#### PART 2: Now make a mask with a permissive threshold that goes all the way to 
## the edges of the nuclei.


image_GausFilt = ndi.filters.gaussian_filter(zstack_MAX_8bit, sigma = 5)
localthresh = filters.threshold_local(image_GausFilt, 41)
image_GausFilt_localtheshold = image_GausFilt > localthresh
image_GausFilt_localtheshold_dilate = morphology.binary_dilation(image_GausFilt_localtheshold)
EdgeMask = image_GausFilt_localtheshold_dilate

#### Part 3: watershed
## Seperate objects
CenterPointArrays_items, num = ndi.label(CenterPointArrays)

## Watershed
watershed_image = morphology.watershed(~zstack_MAX_8bit_GausFilt, CenterPointArrays_items, mask = EdgeMask)

#### Part 4: Clear borders
Watershed_ClearBorders = segmentation.clear_border(watershed_image)

### Itemize
labeled_array_segmentation, num_features_seg = ndi.label(Local_max_mask)

## Don't understand how the CenterPointArrays is actually specifying points,
## I need to figure this out so I can assign nuclei identities
In [92]:
## EdgeMask Gaussian septerated. 

EdgeMask_gauss = ndi.filters.gaussian_filter(EdgeMask, .5)
plt.figure()
plt.imshow(EdgeMask_gauss, "Greys");
In [93]:
## The edges are a problem, but not sure why.
plt.imshow(Local_max_bigger); #located the centers
In [94]:
#get center coordinates (each center is equal to 1), 
#and the coordinates are the indices for row and column of the 1 value in matrix

y_len = len(Local_max_bigger)
x_len = len(Local_max_bigger[1])
centersX = []
centersY = []
for j in range(y_len): 
    for i in range(x_len):
        if Local_max_bigger[j][i] > 0: 
            centersX.append(i)
            centersY.append(j)
        
In [109]:
centers = pd.DataFrame({"x":centersX, 
                        "y": centersY})
plt.imshow(EdgeMask_gauss, "Greys");
plt.scatter(centers["x"], centers["y"], s=.01);
In [98]:
cluster_ft = DBSCAN(eps=1, min_samples=4) 
labels = cluster_ft.fit_predict(centers)
labels
Out[98]:
array([  0,   0,   0, ..., 307, 307, 307])
In [117]:
plt.figure()
plt.scatter(centers["x"], centers["y"], s = .5, c=cluster_ft.labels_, marker=".");
#it's hard to tell that they are different, but the colors 
In [100]:
np.unique(labels) 
Out[100]:
array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
       182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
       195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
       208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
       221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
       234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
       247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
       260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
       273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
       286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
       299, 300, 301, 302, 303, 304, 305, 306, 307])
In [118]:
centers["center_labels"] = labels
In [119]:
centers.head()
Out[119]:
x y center_labels
0 437 4 0
1 438 4 0
2 439 4 0
3 440 4 0
4 441 4 0
In [120]:
timepoint1 = trial1[trial1["timept"] == 1] #test with 1 timepoint of array1

avg_cent = centers.groupby("center_labels").mean() #average of the centers (so that it's only one coordinate)

fig, ax = plt.subplots(1, 2, figsize=(20, 6))
ax[0].imshow(Local_max_bigger, cmap="gray", label="actual") #actual centers
ax[0].scatter(avg_cent.x, avg_cent.y, s=10, color='red', label="averaged centers"); #averaged centers coords
ax[0].set_title("Center Average overlay on 'Actual' Centers")
ax[0].legend()

ax[1].scatter(avg_cent.x, avg_cent.y);
ax[1].scatter(timepoint1["x"], timepoint1["y"]);
# ax[1].set_ylim(1000, 0);
ax[1].set_title("Averaged Centers and Timepoint1 coordinates");
In [121]:
#let's loop through the timepoint 1 coordinates and compare it with all the avg_cent point coordinates, 
#if it's close then i'll associate that MS2 point to that center (and that MS2 dot will be grouped the same label as the centers)
#there's probably a clustering algorithm that optimizes this ... (kmeans on MS2 dots with fixed centers?)

#we're going to be using euclidean distance
def euclidean_dist(x1, y1, x2, y2): 
    return np.sqrt((x1-x2)**2 + (y1-y2)**2)

dist = float('inf') #temp
closest_center_label = -1 #temp
#in the same order as timepoint1 inputs
label_list = []

for i in range(len(timepoint1)): 
    x1 = timepoint1.iloc[i]["x"]
    y1 = timepoint1.iloc[i]["y"]
    
    for j in range(len(avg_cent)): 
        x2 = avg_cent.iloc[j]["x"]
        y2 = avg_cent.iloc[j]["y"]
        
        new_dist = euclidean_dist(x1, y1, x2, y2)
        if new_dist < dist: 
            dist = new_dist
            closest_center_label = avg_cent.index[j]
            
    #add/record in dictionary after searching through all centers
    label_list.append(closest_center_label)
    #reset distance 
    dist = float('inf')
In [122]:
timepoint1["closest_center_labels"] = label_list
/Users/cierapink/.local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [123]:
timepoint1.head()
Out[123]:
x y timept DBSCAN_labels closest_center_labels
69 734.0 255.0 1.0 0 290
70 709.0 255.0 1.0 64 295
71 247.0 255.0 1.0 65 277
72 896.0 246.0 1.0 2 303
73 693.0 245.0 1.0 1 307
In [124]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
ax[0].scatter(avg_cent.x, avg_cent.y);
ax[0].scatter(timepoint1["x"], timepoint1["y"], color="orange");
for i, txt in enumerate(np.arange(19)): #counter and text ------ notice text/counter need to be same size as label
    ax[0].annotate(txt, (avg_cent.iloc[i]["x"], avg_cent.iloc[i]["y"]))


ax[1].scatter(timepoint1["x"], timepoint1["y"], color="orange");
for i, txt in enumerate(timepoint1["closest_center_labels"]): #counter and text
    ax[1].annotate(txt, (timepoint1.iloc[i]["x"], timepoint1.iloc[i]["y"]))
In [125]:
plt.figure(figsize=(10, 8))

plt.scatter(avg_cent.x, avg_cent.y, label="centers");
plt.scatter(trial1[trial1["DBSCAN_labels"] == -1]["x"], trial1[trial1["DBSCAN_labels"] == -1]["y"], color="lightgreen", label="outliers");
plt.scatter(trial1[trial1["DBSCAN_labels"] != -1]["x"], trial1[trial1["DBSCAN_labels"] != -1]["y"], color="orange", label="clusters");
plt.title("Centers and All Timepoints");
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5));

#supposed outlier points are in lightgreen
#all other MS2 dots are in orange
#center of nuclei are in blue

#but it does seem like there are some light green points near centers 
#not sure if this means that some of our clusterings for -1 are not actually noise
In [126]:
#function
def associate_centers(df):
    
    dist = float('inf') #temp
    closest_center_label = -1 #temp
    #in the same order as trial1 inputs
    label_list = []

    for i in range(len(df)): 
        x1 = df.iloc[i]["x"]
        y1 = df.iloc[i]["y"]

        for j in range(len(avg_cent)): 
            x2 = avg_cent.iloc[j]["x"]
            y2 = avg_cent.iloc[j]["y"]

            new_dist = euclidean_dist(x1, y1, x2, y2)
            if new_dist < dist: 
                dist = new_dist
                closest_center_label = avg_cent.index[j]

        #add/record in dictionary after searching through all centers
        label_list.append(closest_center_label)
        #reset distance 
        dist = float('inf')
        
    return label_list
In [127]:
label_list = associate_centers(trial1)
trial1["closest_center_labels"] = label_list
In [128]:
non_outliers = trial1[trial1["DBSCAN_labels"] != -1]
outliers = trial1[trial1["DBSCAN_labels"] == -1]

fig, ax = plt.subplots(1, 2, figsize=(16, 6))

ax[0].scatter(avg_cent.x, avg_cent.y);
ax[0].scatter(non_outliers["x"], non_outliers["y"], color="orange");
ax[0].scatter(outliers["x"], outliers["y"], color="lightgreen");
for i, txt in enumerate(np.arange(19)): #counter and text
    ax[0].annotate(txt, (avg_cent.iloc[i]["x"], avg_cent.iloc[i]["y"]))

    
ax[1].scatter(non_outliers["x"], non_outliers["y"], color="orange");
seen = []
for i, txt in enumerate(non_outliers["closest_center_labels"]): #counter and text
    if txt not in seen:
        ax[1].annotate(txt, (non_outliers.iloc[i]["x"], non_outliers.iloc[i]["y"]))
        seen.append(txt)

ax[1].scatter(outliers["x"], outliers["y"], color="lightgreen");
seen2 = []
for i, txt in enumerate(outliers["closest_center_labels"]): #counter and text
    if txt not in seen2:
        ax[1].annotate(txt, (outliers.iloc[i]["x"], outliers.iloc[i]["y"]))
        seen2.append(txt)
        
ax[1].set_title("All Timepoints and Center Labels");
In [129]:
trial1
Out[129]:
x y timept DBSCAN_labels closest_center_labels
0 731.0 255.0 0.0 0 290
1 867.0 251.0 0.0 -1 306
2 248.0 250.0 0.0 -1 277
3 692.0 247.0 0.0 1 307
4 895.0 246.0 0.0 2 303
... ... ... ... ... ...
612 473.0 20.0 8.0 88 4
613 243.0 18.0 8.0 58 10
614 963.0 13.0 8.0 73 6
615 749.0 8.0 8.0 60 15
616 443.0 0.0 8.0 61 0

617 rows × 5 columns

In [ ]: