Double-strand break detection¶

In [1]:
import os
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import numpy as np
import pandas as pd
from src.feature_reader import FeatureReader
from src.vis_utils import FeaturePlotter, ImageCropper
from src import stats
from IPython.display import Image, set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')

No features are detected when there is an absence of cells. Where no channel is found, the origin of the error is not clear, as these often have many cells in view. In these cases, hdf5 output file is furthermore tiny (~15Kb).

In [2]:
# base_folder = '/Users/jcboyd/Documents/drug_screen/'
# plate = '25D_20_384_20X_D_F_C3_C5_Decon_20160031_2'
# image_folder = '/Users/jcboyd/Data/25D/%s' % plate

base_folder = '/Users/jcboyd/Documents/drug_screen_widefield/'
plate = '22_384_20X-hNA_D_F_C3_C5_20160032_2016.01.25.16.27.22_MDA468'
image_folder = '/Users/jcboyd/Data/Widefield/%s' % plate

ch5_folder = os.path.join(base_folder, 'cecog_out/%s/hdf5/' % plate)
channels = ['primary__primary4', 'secondary__propagate', 'tertiary__expanded']
In [3]:
fr = FeatureReader(plate, ch5_folder, channels)
fr.read_features(generate_new=True)
fr.save_data()
Reading feature metadata...
Done!
Reading feature data...
[====================================================================================================>] (100%)
Done!
Saving data...
Done!
In [4]:
metadata_folder = os.path.join(base_folder, 'metadata')
metadata_file = 'PlateMap-KPP_MOA.xlsx' # 'PlateMap-KPP utilisée sur MDA 231 et MDA 468(modified).xlsx'
metadata = pd.read_excel(os.path.join(metadata_folder, metadata_file))

fr.join_plate_map(metadata)
In [5]:
# rewrite column headers
df_feats = fr.data.rename(columns={
    'cell_index' : 'cell count',
    'primary__primary4_n2_avg' : 'average intensity',
    'primary__primary4_roisize' : 'roi size',  # nuclear size
    'tertiary__expanded_granu_open_volume_1' : 'granulometry (1)',  # granulometry level 1
    'tertiary__expanded_granu_open_volume_2' : 'granulometry (2)',  # granulometry level 2
    'tertiary__expanded_granu_open_volume_3' : 'granulometry (3)',  # granulometry level 3
    'tertiary__expanded_granu_open_volume_5' : 'granulometry (5)',  # granulometry level 5
    'tertiary__expanded_granu_open_volume_7' : 'granulometry (7)',  # granulometry level 7
    'tertiary__expanded_spotfeatures_avg_intensity' : 'avg intensity',  # average intensity
    'tertiary__expanded_n2_avg' : 'n2 avg',  # normalised average intensity
    'tertiary__expanded_spotfeatures_count' : 'spot count'})  # number of spots

df_feats['spot density'] = (df_feats['spot count'] / df_feats['roi size'])  # spots per area
df_feats['total intensity'] = df_feats['average intensity'] * df_feats['roi size']
In [6]:
from cecog import ccore

def diameter_opening(image_path, diameter=5, threshold=8):
    # import raw image
    cecog_image = ccore.readImageUInt16(image_path)
    # convert to 8-bit
    conv_image = ccore.conversionTo8Bit(cecog_image, 0, 65535, 0, 255)  #, 0, 255) 
    # perform diameter opening
    open_image = ccore.diameter_open(conv_image, diameter, 8)  # treshold?
    # build difference
    diff_image = ccore.substractImages(conv_image, open_image)
    # threshold
    thresh_image = ccore.threshold_image(diff_image, threshold)
    return conv_image, thresh_image
In [7]:
def plot_comparison(well, field, index):
    image_name = '%s - %s(fld %s wv Cy3 - Cy3).tif' % (well[0], well[1:], field)
    image_path = os.path.join(image_folder, image_name)

    ic = ImageCropper(well, field, ch5_folder)
    bounding_boxes = ic.get_bounding_box_list('primary__primary4')
    left, right, top, bottom = bounding_boxes[index]

    conv_image, thresh_image = diameter_opening(image_path)

    crop = conv_image.toArray()[top:bottom, left:right]
    img_crop = np.dstack((crop, crop, crop))
    
    overlay = thresh_image.toArray()[top:bottom, left:right]
    zeroed_crop = crop.copy()
    zeroed_crop[overlay > 0] = 0
    img_spots = np.dstack((zeroed_crop + overlay, crop, crop))
    
    return img_crop, img_spots
In [8]:
fig = plt.figure(figsize=(10, 5))

for i in range(4):
    idx = np.random.randint(df_feats.shape[0])
    well, field, index = df_feats.loc[idx][['well', 'field', 'cell count']]
    
    img_crop, img_spots = plot_comparison(well, field, index)

    ax = fig.add_subplot(2, 4, i + 1)
    ax.imshow(img_crop)
    ax.axis('off')

    ax = fig.add_subplot(2, 4, i + 5)
    ax.imshow(img_spots)
    ax.axis('off')

plt.show()

1. Comparison of DSB detection approaches¶

We see a strong correlation between measuring normalised spot count and a granulometry-based approach.

In [9]:
import itertools
fields = ['spot density', 'granulometry (2)', 'granulometry (3)', 'n2 avg']

df_plot = df_feats.groupby('well')[fields].mean().reset_index()
fig = plt.figure(figsize=(10, 8))

for i, (field1, field2) in enumerate(itertools.combinations(fields, 2)):
    ax = fig.add_subplot(3, 2, i + 1)
    sns.regplot(field1, field2, data=df_plot, fit_reg=True, scatter_kws={'s': 5}, ax=ax)
    corr = np.corrcoef(df_plot[field1], df_plot[field2])[0, 1]
    ax.set_title(r'%s vs. %s ($\rho=%0.2f$)' % (field1, field2, corr))

plt.tight_layout()
plt.show(fig)

Identify outlier well¶

In [10]:
df_ratio = df_feats.groupby('well')[['spot density', 'granulometry (2)']].mean().reset_index()
df_ratio['ratio'] = df_ratio['granulometry (2)'] / df_ratio['spot density']

Identify outlier (i.e. point with greatest perpendicular distance to trendline i.e. dy/dx much lower than trendline gradient).

In [11]:
print(df_ratio.loc[df_ratio['ratio'].argmin()])
print(df_ratio.loc[df_ratio['ratio'].argmax()])
well                      K01
spot density        0.0119787
granulometry (2)    0.0272633
ratio                 2.27598
Name: 219, dtype: object
well                       B17
spot density        0.00328116
granulometry (2)     0.0353842
ratio                   10.784
Name: 33, dtype: object

Visualise the four positions of the outlier (in granulometry-density correlation), well G17. That is, unusually dense for the given granulometry. This is therefore where the two approaches disagree most.

In [12]:
well = 'K01'
fig = plt.figure(figsize=(8, 6))

for i in range(4):
    ax = fig.add_subplot(2, 2, i + 1)
    ic = ImageCropper(well, '0%s' % (i + 1), ch5_folder)
    ic.plot_image(ax)

plt.tight_layout()
plt.show(fig)

Spot count vs. roi size¶

In [13]:
df_plot = df_feats[(df_feats['well'] == 'A01') & (df_feats['spot count'] < 30) & 
                   (df_feats['roi size'] < 2500)]
In [14]:
g = sns.jointplot(x='roi size', y='spot count', data=df_plot, kind='hex', color='k')
plt.show(fig)

The curve artefacts at the base of the cloud are the result of computing spot density on cells with a single spot, two spots, etc., giving density = $\frac{1}{area}$, $\frac{2}{area}$, etc. as area is fairly constant across cells.

2. Analysis¶

2.1 The Z-prime test (positive vs. negative controls)¶

The HTS Z-prime test (Zhang et al., 1999) is really intended for comparing the distributions of a per-well readout over many replications. Given a single well, the Z-prime test is not appropriate, and taking individual cells as replications is nonsensical, given that there are obviously modifying effects of the randomly varying cell cycle phases, as well as the drug's modifying effect on cell morphologies. These differences, which, in true replication, are marginalised by the law of large numbers, render the test meaningless.

In [15]:
pool_contents = ['DMSO', 'None', 'Olaparib', 'Cisplatine']
df_pools = df_feats[df_feats['content'].isin(pool_contents)]

fig, ax = plt.subplots(figsize=(8, 5))

sns.boxplot(x='content', y='spot count', data=df_pools,
            ax=ax, palette=sns.color_palette("Blues"))
ax.set_ylim([0, 50])
plt.show(fig)
In [16]:
stats.z_prime(df_feats[df_feats['content'] == 'DMSO']['spot count'],
              df_feats[df_feats['well'] == 'G17']['spot count'])
Out[16]:
nan

2.2 Significant results¶

In [17]:
df_med = pd.DataFrame(df_feats.groupby('well')['spot count'].median())
df_sort = df_med.sort_values('spot count', ascending=False).reset_index()
In [18]:
k = 5
top_k = df_sort.head(k)['well'].get_values().tolist()
df_top_k = df_feats[df_feats['well'].isin(top_k)]

df_dmso = df_feats[df_feats['content'] == 'DMSO'].copy(deep=True)
df_dmso['well'] = 'DMSO'

top_k.append('DMSO')
df_plot = pd.concat([df_top_k, df_dmso])
In [19]:
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='well', y='spot count', data=df_plot, order=top_k,
            palette=sns.color_palette('Blues'), ax=ax)
ax.set_ylim([0, 100])
plt.show()

The selection bias appears to cause a lack of exchangeability between high and low cells.

2.3 Results by spot count¶

We proceed with the spot count approach:

In [20]:
feature = 'spot count'
In [21]:
df_non_dmso = df_feats[~df_feats['content'].isin(['DMSO', 'None'])]
df_dmso = df_feats[df_feats['content'] == 'DMSO']
df_empty = df_feats[df_feats['content'] == 'None']

BONFERRONI_THRESHOLD = stats.P_VALUE_THRESHOLD / len(df_non_dmso['well'].unique())
print(BONFERRONI_THRESHOLD)
6.71140939597e-05

We use a K-S test to compare with the negative control, with a Bonferonni correction on a 0.01 p-value threshold. Note this is an especially conservative correction.

In [22]:
df_p_scores = df_non_dmso.groupby('well')[feature].agg(
    lambda x: stats.kolmogorov_smirnov(df_dmso[feature], x)).reset_index()

df_sig = df_p_scores[df_p_scores[feature] < BONFERRONI_THRESHOLD]
df_sig.sort_values('spot count').reset_index().drop('index', axis=1).head(10)
Out[22]:
well spot count
0 G01 3.243319e-22
1 P22 2.766204e-20
2 C03 5.733215e-19
3 M15 2.917651e-15
4 K11 7.949445e-14
5 E21 5.787346e-12
6 O11 1.005264e-11
7 K17 1.406564e-11
8 C23 1.805451e-11
9 E19 1.865369e-11

We repeat the above with a Benjamini-Hochberg correction.

In [23]:
k = stats.benjamini_hochberg(df_p_scores[feature])
df_sig = df_p_scores.sort_values(feature).reset_index().head(k)
print('<Benjamini-Hochberg>\n<%s>\nNumber of hits: %d' % (feature, len(df_sig)))
<Benjamini-Hochberg>
<spot count>
Number of hits: 91

In Perelman et al. (2004), they note, "At low concentrations, the histogram for the total DNA content has the characteristic bimodal shape reflecting a mixture of G1, S, and G2/M cell populations. G2 and M populations may be distinguished by 2D display of total DNA signal against nuclear area (25)." (page 2)

In [24]:
# Clip extreme outliers for better clustering
THRESHOLD = 250000
df_threshold = df_feats[df_feats['total intensity'] < THRESHOLD]

Sample distribution of spot count. Recall a histogram becomes the probability density function in the limit.

In [25]:
fig = plt.figure(figsize=(6, 4))

well = 'A09'
ax = fig.add_subplot(1, 1, 1, xlabel='total intensity (px)')
dist = sns.distplot(df_threshold['total intensity'], kde=True, ax=ax, color='blue', bins=50)
ax.grid('on')
plt.show(fig)
In [26]:
INTENSITY_THRESHOLD = 130000

fig, ax = plt.subplots(figsize=(6, 4))

low_densities = df_feats[df_feats['total intensity'] <= INTENSITY_THRESHOLD]['spot density']
sns.distplot(low_densities, ax=ax, hist=False, color='blue', bins=30, label='G1')
high_densities = df_feats[df_feats['total intensity'] > INTENSITY_THRESHOLD]['spot density']
sns.distplot(high_densities, ax=ax, hist=False, color='red', bins=30, label='G2')

ax.legend([r'$G_1$', r'$G_2$'])
ax.grid('on')
ax.set_xlim([0, 0.02])
ax.set_xlabel('spot density')

plt.show()

2.5 Adjusted results¶

Here we explore the effect modification of the cell cycle phase by identifying significant results in the cell state subpopulations separately.

In [27]:
# find high/low intensity cells in DMSO wells
df_dmso_low = df_dmso[df_dmso['total intensity'] <= INTENSITY_THRESHOLD]
df_dmso_high = df_dmso[df_dmso['total intensity'] > INTENSITY_THRESHOLD]

# find high/low intensity cells in perturbation wells
df_non_dmso_low = df_non_dmso[df_non_dmso['total intensity'] <= INTENSITY_THRESHOLD]
df_non_dmso_high = df_non_dmso[df_non_dmso['total intensity'] > INTENSITY_THRESHOLD]

# calculate statistics for low intensity perturbations
df_p_low = df_non_dmso_low.groupby('well')[feature].agg(
    lambda x: stats.kolmogorov_smirnov(df_dmso_low[feature], x)).reset_index()

# calculate statistics for high intensity perturbations
df_p_high = df_non_dmso_high.groupby('well')[feature].agg(
    lambda x: stats.kolmogorov_smirnov(df_dmso_high[feature], x)).reset_index()

Significance in both subpopulations (Bonferonni correction)¶

Consider those wells have having a simulataneous effect on both cell cycle phases (Bonferonni correction):

In [28]:
g1_cells = set(df_p_low[df_p_low[feature] < BONFERRONI_THRESHOLD]['well'])
g2_cells = set(df_p_high[df_p_high[feature] < BONFERRONI_THRESHOLD]['well'])
joint = g1_cells.intersection(g2_cells)

print('<%s>\nNumber low: %d\nNumber high: %d\nJoint: %d' % 
      (feature, len(g1_cells), len(g2_cells), len(joint)))
print joint
<spot count>
Number low: 57
Number high: 15
Joint: 4
set(['K11', 'G01', 'I13', 'P22'])

Significance in both subpopulations (Benjamini-Hochberg correction)¶

Consider those wells have having a simulataneous effect on both cell cycle phases (Benjamini-Hochberg correction):

In [29]:
k = stats.benjamini_hochberg(df_p_low[feature])
g1_cells = set(df_p_low.sort_values(feature).reset_index().head(k)['well'])

k = stats.benjamini_hochberg(df_p_high[feature])
g2_cells = set(df_p_high.sort_values(feature).reset_index().head(k)['well'])

joint = g1_cells.intersection(g2_cells)

print('<%s>\nNumber low: %d\nNumber high: %d\nJoint: %d' % 
      (feature, len(g1_cells), len(g2_cells), len(joint)))
print joint
<spot count>
Number low: 83
Number high: 26
Joint: 11
set(['K13', 'C23', 'K11', 'K17', 'E21', 'K09', 'E05', 'I13', 'P22', 'C03', 'G01'])