"Two-Sided Non-Compliance: A Threat to Accurate A/B Testing"
Goal:
In [ ]:
In [1]:
# Data manipulation, step 1: profile the raw data so that any problems
# (missing values, outliers, wrong dtypes) can be fixed before testing.
# data_manipulation is provided by the AB_experiment module.
from AB_experiment import data_manipulation

# Instance through which the data-manipulation methods are called.
dm = data_manipulation()

# Arguments for data_info; they are passed positionally in this order.
data = 'app_data.csv'
column1 = "group"
column2 = ["downloaded_app", "time_spent(min)"]
quartile1, quartile3 = 0.25, 0.75
info = True
download_df = False
filename = 'new'

dm.data_info(data, column1, column2, quartile1, quartile3, info, download_df, filename)
Out[1]:
{'1': ['dataframe_shape', {'Observations': 30000, 'Column': 4}],
'2': ['missing_data_info', {'No missing values'}],
'3': ['outliers_info',
[{'variable_name time_spent(min)': 'No outliers present'}]],
'4': ['data_types',
[{'object_values': "['group', 'downloaded_app']"},
{'float_values': '[]'},
{'int_values': ['user_id', 'time_spent(min)']},
{'bool_val': []}]],
'5': ['numerical_Variables', ['user_id', 'time_spent(min)']],
'6': ['Categorical_variables', ['group', 'downloaded_app']],
'7': [{'Unique values count for variable': group
ad 17903
referral 12097},
{'Unique values count for variable': downloaded_app
Yes 18393
No 11607},
{'Unique values count for variable': time_spent(min)
20 2351
18 2271
19 2241
12 2213
13 2212
17 2209
14 2205
10 2196
15 2188
11 2186
16 2151
8 1150
6 1121
7 1119
5 1108
9 1079}],
'8': ['Descriptive statistics-numerical_Variables',
user_id time_spent(min)
count 30000.000000 30000.000000
mean 497244.479467 13.548800
std 289220.271868 4.290116
min 41.000000 5.000000
25% 246691.000000 10.000000
50% 495162.000000 14.000000
75% 747418.250000 17.000000
max 999979.000000 20.000000,
'********************',
'Descriptive statistics-Categorical_variables',
group downloaded_app
count 30000 30000
unique 2 2
top ad Yes
freq 17903 18393,
'********************'],
'9': {'category_stats': [ time_spent(min)
count median mean std min max
group
ad 17903 13.0 12.533654 4.633040 5 20
referral 12097 15.0 15.051170 3.177318 10 20]},
'10': ['Dataframe',
user_id group downloaded_app time_spent(min)
0 784598 ad Yes 13
1 699052 referral Yes 11
2 218829 ad No 7
3 627414 ad Yes 7
4 190259 referral No 10]}
In [ ]:
In [4]:
# A categorical variable is present, so convert it to a numerical one
# with categorical_encoding.
# data_manipulation is provided by the AB_experiment module.
from AB_experiment import data_manipulation

# Instance through which the data-manipulation methods are called.
dm = data_manipulation()

data = 'app_data.csv'
variables = ['downloaded_app']
# NOTE(review): presumably download_df=True saves the encoded frame as
# '<filename>.csv' (the following cells read 'new.csv') - confirm in AB_experiment.
download_df = True
filename = 'new'

dm.categorical_encoding(data, variables, download_df, filename)
Out[4]:
[{'Before encoding': {'Variable_name': 'downloaded_app',
'unique_values': array(['Yes', 'No'], dtype=object)},
'After encoding': {'Variable_name': 'downloaded_app_coded',
'unique_values': array([1, 0])}}]
In [12]:
# The previous step produced a numeric coded variable; cast it to bool
# as well, for better analysis.
# This cell relies on the `dm` instance created in an earlier cell.
data = 'new.csv'
change_variables = ['downloaded_app_coded']
dtype = ['bool']
drop_variables = []
download_df = True
filename = 'new'

dm.change_variables(data, change_variables, dtype, drop_variables, download_df, filename)
Out[12]:
[{'Variable1': ['downloaded_app_coded', dtype('bool')]}]
In [13]:
# After changing the data types, check data_info again on the updated file.
# This cell relies on the `dm` instance created in an earlier cell.
data = 'new.csv'
column1 = "group"
column2 = ["downloaded_app", "time_spent(min)"]
quartile1, quartile3 = 0.25, 0.75
info = True
download_df = False
filename = 'new'

dm.data_info(data, column1, column2, quartile1, quartile3, info, download_df, filename)
Out[13]:
{'1': ['dataframe_shape', {'Observations': 30000, 'Column': 5}],
'2': ['missing_data_info', {'No missing values'}],
'3': ['outliers_info',
[{'variable_name time_spent(min)': 'No outliers present'}]],
'4': ['data_types',
[{'object_values': "['group', 'downloaded_app']"},
{'float_values': '[]'},
{'int_values': ['user_id', 'time_spent(min)']},
{'bool_val': ['downloaded_app_coded']}]],
'5': ['numerical_Variables', ['user_id', 'time_spent(min)']],
'6': ['Categorical_variables',
['group', 'downloaded_app', 'downloaded_app_coded']],
'7': [{'Unique values count for variable': group
ad 17903
referral 12097},
{'Unique values count for variable': downloaded_app
Yes 18393
No 11607},
{'Unique values count for variable': time_spent(min)
20 2351
18 2271
19 2241
12 2213
13 2212
17 2209
14 2205
10 2196
15 2188
11 2186
16 2151
8 1150
6 1121
7 1119
5 1108
9 1079},
{'Unique values count for variable': downloaded_app_coded
True 18393
False 11607}],
'8': ['Descriptive statistics-numerical_Variables',
user_id time_spent(min)
count 30000.000000 30000.000000
mean 497244.479467 13.548800
std 289220.271868 4.290116
min 41.000000 5.000000
25% 246691.000000 10.000000
50% 495162.000000 14.000000
75% 747418.250000 17.000000
max 999979.000000 20.000000,
'********************',
'Descriptive statistics-Categorical_variables',
group downloaded_app downloaded_app_coded
count 30000 30000 30000
unique 2 2 2
top ad Yes True
freq 17903 18393 18393,
'********************'],
'9': {'category_stats': [ time_spent(min)
count median mean std min max
group
ad 17903 13.0 12.533654 4.633040 5 20
referral 12097 15.0 15.051170 3.177318 10 20]},
'10': ['Dataframe',
user_id group downloaded_app time_spent(min) downloaded_app_coded
0 784598 ad Yes 13 True
1 699052 referral Yes 11 True
2 218829 ad No 7 False
3 627414 ad Yes 7 True
4 190259 referral No 10 False]}
In [ ]:
In [ ]:
# From the output above, the data has no outliers and no missing values,
# and the data types of all variables are correct.
# Next, we find the sample size.
In [6]:
# First, find the baseline conversion rate of the 'referral' group for
# each target variable.
# stats_test is provided by the AB_experiment module.
from AB_experiment import stats_test
# Instance through which the statistical-test methods are called.
st = stats_test()
data = 'new.csv'
column1 = "group"
column1_value = 'referral'
# Boolean target: downloaded_app_coded.
a = st.baseline_conversion_rate(data, column1, column1_value, column2='downloaded_app_coded')
# Numeric target: the recorded output reports the rate for values greater
# than or equal to the threshold (13.5).
b = st.baseline_conversion_rate(data, column1, column1_value, column2='time_spent(min)', bool_var=False, threshold=13.5)
# Use the newline escape '\n' (not '/n') so each result prints on its own line.
print('downloaded_app', a, '\ntime_spent(min)', b)
downloaded_app_coded {'Baseline conversion rate(p1) of group referral': 0.4877}
time_spent(min) {'Baseline conversion rate(p1) of group referral for greater than or equal to threshold value 13.5': 0.6419}
In [ ]:
In [16]:
# Sample size for each target variable, using its baseline conversion rate.
# This cell relies on the `st` instance created in an earlier cell.
# The p1 values are copied from the baseline conversion rates printed earlier.
p1 = 0.4877
mde = 0.02
alpha = 0.05
power = 0.8
n_side = 2
# For variable downloaded_app_coded
a = st.sample_size(p1, mde, alpha, power, n_side)
# For variable time_spent(min)
p1 = 0.6419
b = st.sample_size(p1, mde, alpha, power, n_side)
# Use the newline escape '\n' (not '/n') so each result prints on its own line.
print('downloaded_app', a, '\ntime_spent(min)', b)
downloaded_app_coded {'Sample size': 9806}
time_spent(min) {'Sample size': 8985}
In [ ]:
In [29]:
# Check the assumptions for each target variable to decide which
# statistical test to use for the A/B test.
# stats_test is provided by the AB_experiment module.
from AB_experiment import stats_test
# Instance through which the statistical-test methods are called.
st = stats_test()
data = 'new.csv'
sample_size = 9806  # sample size reported earlier for downloaded_app_coded
column1 = "group"
column1_value1 = 'referral'
column1_value2 = 'ad'
column2 = "downloaded_app_coded"
alpha = 0.05
paired_data = False
# For variable downloaded_app_coded
a = st.AB_Test_assumption(data, sample_size, column1, column1_value1, column1_value2, column2, alpha, paired_data)
# For variable time_spent(min)
sample_size = 8985  # sample size reported earlier for time_spent(min)
column2 = "time_spent(min)"
b = st.AB_Test_assumption(data, sample_size, column1, column1_value1, column1_value2, column2, alpha, paired_data)
# Use the newline escape '\n' (not '/n') so the sections print on separate lines.
print('For downloaded_app variable\n', a, '\n', 40*'*', '\n For time_spent(min) variable\n', b)
For downloaded_app_coded variable
({'Target variable is boolean data type': 'Use Chi-Squared Test'}, {'Note': 'If our data involve time-to-event or survival analysis (e.g., time until a user completes a task), we can use methods such as the log-rank test'})
****************************************
For time_spent(min) variable
({'Assumption of Normality is not satisfied': 'Non-parametric test => Use Mann-Whitney U test.'}, {'Note': 'If we are comparing more than two groups, such as in an A/B/C testing scenario, we can use Kruskal-Wallis test.'})
C:/Users/VINAYAK/anaconda3/lib/site-packages/scipy/stats/morestats.py:1760: UserWarning: p-value may not be accurate for N > 5000.
warnings.warn("p-value may not be accurate for N > 5000.")
In [ ]:
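By checking assumptions we use the Chi-Squared Test for variable downloaded_app
Define the null and alternative hypotheses:
H0: the proportion of users who downloaded the app is the same in the referral and ad groups.
H1: the proportion of users who downloaded the app differs between the referral and ad groups.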
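By checking assumptions we perform the non-parametric Mann-Whitney U test for variable time_spent(min)
Define the null and alternative hypotheses:
H0: the distribution of time_spent(min) is the same in the referral and ad groups.
H1: the distribution of time_spent(min) differs between the referral and ad groups.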
In [30]:
# Run the tests selected by the assumption checks: the Chi-squared test for
# downloaded_app_coded and the Mann-Whitney U test for time_spent(min).
# stats_test is provided by the AB_experiment module.
from AB_experiment import stats_test
# Instance through which the statistical-test methods are called.
st = stats_test()
data = 'new.csv'
sample_size = 9806  # sample size reported earlier for downloaded_app_coded
column1 = 'group'
column1_value1 = 'referral'
column1_value2 = 'ad'
column2 = 'downloaded_app_coded'
alpha = 0.05
reverse_experiment = False
# Defined here (same value as in the assumption-check cell) so this cell
# does not depend on a variable left over from another cell.
paired_data = False
# For variable downloaded_app_coded
a = st.chi_squared_test(data, sample_size, column1, column1_value1, column1_value2, column2, alpha, reverse_experiment)
# For variable time_spent(min)
sample_size = 8985  # sample size reported earlier for time_spent(min)
column2 = "time_spent(min)"
# NOTE(review): confirm that the last parameter of mann_whitney_U_test is
# paired_data (chi_squared_test takes reverse_experiment in that position).
b = st.mann_whitney_U_test(data, sample_size, column1, column1_value1, column1_value2, column2, alpha, paired_data)
# Last expression of the cell: displayed as the cell output.
('For downloaded_app variable', a, 40*'*', 'For time_spent(min) variable', b)
Out[30]:
('For downloaded_app variable',
[{'Test name': 'Chi-square test',
'Timestamp': '2023-08-11 13:45:42',
'Sample size': 9806,
'Status': 'We can reject H0 => group ad is more successful',
'P-value': 1.602395622342239e-193,
'alpha': 0.05,
'Test Statistic': 880.6203723014223,
'Confidence Interval': (-0.2217828294823734, -0.19490083358105723)},
{'proportion1': 0.4884, 'proportion2': 0.6967}],
'****************************************',
'For time_spent(min) variable',
{'Test name': 'Mann whitney U test',
'Timestamp': '2023-08-11 13:45:45',
'Sample size': 8985,
'Status': 'We can reject H0 => group referral performs better',
'P-value': 1.2118957304952622e-289,
'alpha': 0.05,
'Test Statistic': 52979345.0,
'Confidence Interval': (2.0, 3.0)})
In [ ]:
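Conclusion
From downloaded_app Variable: the Chi-square test rejects H0 at alpha = 0.05 (p-value ≈ 1.6e-193); the download proportion is 0.4884 for the referral group versus 0.6967 for the ad group, so the ad group is more successful.
From time_spent(min) Variable: the Mann-Whitney U test rejects H0 at alpha = 0.05 (p-value ≈ 1.2e-289); the referral group performs better on time spent, with a reported confidence interval of (2.0, 3.0).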
In [ ]:
In [ ]:
|