import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.linear_model import LinearRegression


## Load data stored locally
# All paths are relative, so the CSV files must be in the working directory.
us_covid = pd.read_csv("national-history.csv")
state_covid = pd.read_csv("all-states-history.csv")
california_unemployment = pd.read_csv("california_unemployment.csv")
florida_unemployment = pd.read_csv("florida_unemployment.csv")
state_unemployment = pd.read_csv('unemployment_data.csv')
NY_Covid_Policy = pd.read_csv("NY_Covid_Policy.csv")
FL_Covid_Policy = pd.read_csv("FL_Covid_Policy.csv")
state_info = pd.read_csv("state_info.csv")

# Alternative: load data hosted on GitHub (disabled; kept for reference).
# This used to be a bare string literal delimited by mismatched quote runs
# (four quotes to open, five to close). Real comments cannot be mistaken for
# live code or for a docstring.
# SECURITY: the original URLs embedded access tokens in the `token` query
# parameter. They are redacted to <TOKEN> here; supply a token at run time
# instead of committing one, and treat the previously committed ones as exposed.
# us_covid = pd.read_csv("https://raw.github.coecis.cornell.edu/erp49/INFO2950/master/clean_us_covid.csv?token=<TOKEN>")
# state_covid = pd.read_csv("https://raw.github.coecis.cornell.edu/erp49/INFO2950/master/all-states-history.csv?token=<TOKEN>")
# NY_Unemployment = pd.read_csv("https://raw.github.coecis.cornell.edu/erp49/INFO2950/master/NY_UnemploymentClaims.csv?token=<TOKEN>")
# FL_Unemployment = pd.read_csv("https://raw.github.coecis.cornell.edu/erp49/INFO2950/master/FL_UnemploymentClaims.csv?token=<TOKEN>")
# US_Unemployment = pd.read_csv("https://raw.github.coecis.cornell.edu/erp49/INFO2950/master/US_UnemploymentClaims.csv?token=<TOKEN>")
# NY_Covid_Policy = pd.read_csv("https://raw.github.coecis.cornell.edu/erp49/INFO2950/master/NY_Covid_Policy.csv?token=<TOKEN>")
# FL_Covid_Policy = pd.read_csv("https://raw.github.coecis.cornell.edu/erp49/INFO2950/master/FL_Covid_Policy.csv?token=<TOKEN>")

#Parse each date column into pandas datetimes so it can be compared and plotted
for _frame, _date_col in ((state_covid, 'date'), (us_covid, 'date'), (state_unemployment, 'Date')):
    _frame[_date_col] = pd.to_datetime(_frame[_date_col])

def cleanUnemployment(df):
    """
    Normalizes an unemployment data frame in place.

    Every column name is lower-cased and has its spaces replaced with '_'.
    The resulting 'filed_week_ended' column is then converted to datetime
    objects. Nothing is returned; df itself is modified.

    Parameter df: data frame to manipulate
    Precondition: df is a pandas dataframe with string column names, one of
    which normalizes to 'filed_week_ended'
    """
    df.columns = [name.lower().replace(' ', '_') for name in df.columns]
    week_col = 'filed_week_ended'
    df[week_col] = pd.to_datetime(df[week_col])
    
#Normalize both weekly state unemployment tables in place
for _weekly_claims in (florida_unemployment, california_unemployment):
    cleanUnemployment(_weekly_claims)

#Add population, political affiliation, and region information to state_covid dataframe
#(rows are matched to state_info on the shared 'state' column)
state_covid = state_covid.merge(state_info, on='state')


#CALCULATIONS

#Share of each day's new test results that were positive
for _frame in (state_covid, us_covid):
    _frame['percentPositiveIncrease'] = _frame['positiveIncrease'] / _frame['totalTestResultsIncrease']

#Rates below are expressed per 100,000 people
perNum = 100000
US_population = 328.2 * 10**6

#Count columns (daily increases, then running totals) and the name of the
#per-capita rate column derived from each one, in matching order
cols_of_interest = ['totalTestResultsIncrease','positiveIncrease','deathIncrease','totalTestResults','positive', 'death']
new_cols = ['testingIncreaseRate', 'positiveIncreaseRate', 'deathIncreaseRate', 'testingRate', 'positiveRate', 'deathRate']

for i, new_c in enumerate(new_cols):
    c = cols_of_interest[i]

    #National rows use the fixed US population; state rows use their own 'population' column
    us_covid[new_c] = us_covid[c] / US_population * perNum
    state_covid[new_c] = state_covid[c] / state_covid['population'] * perNum

#Positive results relative to test results: daily (scaled by perNum) and cumulative (unscaled)
for _frame in (us_covid, state_covid):
    _frame['percent_positiveIncreaseRate'] = _frame["positiveIncrease"] / _frame["totalTestResultsIncrease"] * perNum
    _frame['percent_positiveRate'] = _frame["positive"] / _frame['totalTestResults']

#Isolate CA and FL from state_covid
CA_covid = state_covid[state_covid['state'] == 'CA']
FL_covid = state_covid[state_covid['state'] == 'FL']

#Matching per-state columns of the state unemployment table
CA_unemployment = state_unemployment[['Date', 'CA']]
FL_unemployment = state_unemployment[['Date', 'FL']]
#-------------------------------------------------------------------------------------------------------------------------
#GROUP

#Get totals from most recent date on 3/07/2021
latest_date = pd.to_datetime('3-07-2021')
us_totals = us_covid.loc[us_covid['date'] == latest_date]
state_totals = state_covid.loc[state_covid['date'] == latest_date]

#Isolate columns of interest
us_totals = us_totals[['testingRate', 'positiveRate', 'percent_positiveRate', 'deathRate']]
state_totals = state_totals[['state', 'testingRate', 'positiveRate', 'percent_positiveRate','deathRate', 'Political_Affiliation', 'Region']]

#Group states by political Affiliation
state_politic_totals = state_totals.groupby(by='Political_Affiliation')

Dem_totals = state_politic_totals.get_group('Democratic')
Rep_totals = state_politic_totals.get_group('Republican')

#Group states by Region
state_region = state_totals.groupby(by='Region')

Midwest = state_region.get_group('Midwest')
Northeast = state_region.get_group('Northeast')
South = state_region.get_group('South')
West = state_region.get_group('West')

#Make Same Calculations Specific to People in Democratic and Republican States
#The key is a plain column name rather than a one-element list, so
#get_group('Democratic') stays valid on pandas versions that expect a tuple
#for list-style keys.
state_covid_politic = state_covid[['date','deathIncrease','positiveIncrease', 'totalTestResultsIncrease', 'population', 'Political_Affiliation']].groupby('Political_Affiliation')

#For every date, sum the counts and populations of all Democratic / Republican states.
#numeric_only=True keeps the text Political_Affiliation column out of the sum.
Dem_covid = state_covid_politic.get_group('Democratic').groupby('date').sum(numeric_only=True)
Rep_covid = state_covid_politic.get_group('Republican').groupby('date').sum(numeric_only=True)

#Daily increases per 100,000 people, using each party's summed population
cols_of_interest = ['totalTestResultsIncrease','positiveIncrease','deathIncrease']
new_cols = ['testingIncreaseRate', 'positiveIncreaseRate', 'deathIncreaseRate',]

for new_c, c in zip(new_cols, cols_of_interest):
    Dem_covid[new_c] = Dem_covid[c]/Dem_covid['population'] * perNum
    Rep_covid[new_c] = Rep_covid[c]/Rep_covid['population'] * perNum

#Row-wise average of the state_unemployment columns belonging to each party's states
dem_states = list(Dem_totals['state'])
rep_states = list(Rep_totals['state'])
Dem_unemployment = state_unemployment[dem_states].mean(axis=1)
Rep_unemployment = state_unemployment[rep_states].mean(axis=1)

#Build the frame column by column so 'Date' stays a datetime column and the
#averages stay numeric, instead of passing all three through one NumPy array.
unemployment_politic = pd.DataFrame({
    'Date': state_unemployment['Date'],
    'Democrat': Dem_unemployment,
    'Republican': Rep_unemployment,
}).reset_index(drop=True)

print(us_covid['testingRate'].mean())

32704.498431560332


#Download CSVs of cleaned data to local drive
#
# Disabled export code, kept as comments. It used to be wrapped in a plain
# (non-raw) triple-quoted string, where the "\U" of "C:\Users" is parsed as
# a truncated \UXXXXXXXX escape and raises a SyntaxError for the whole cell.
#
# us_covid.to_csv (r'C:\Users\kathe\OneDrive\Documents\S2-DESKTOP-S9R0JMM\INFO_2950\Final Project\clean_us_covid.csv', index = False, header=True)
# state_covid.to_csv(r'C:\Users\kathe\OneDrive\Documents\S2-DESKTOP-S9R0JMM\INFO_2950\Final Project\clean_state_covid.csv', index = False, header=True)
# CA_covid.to_csv (r'C:\Users\kathe\OneDrive\Documents\S2-DESKTOP-S9R0JMM\INFO_2950\Final Project\clean_CA_covid.csv', index = False, header=True)
# FL_covid.to_csv (r'C:\Users\kathe\OneDrive\Documents\S2-DESKTOP-S9R0JMM\INFO_2950\Final Project\clean_FL_covid.csv', index = False, header=True)
# state_totals.to_csv (r'C:\Users\kathe\OneDrive\Documents\S2-DESKTOP-S9R0JMM\INFO_2950\Final Project\clean_state_totals.csv', index = False, header=True)

  File "<ipython-input-24-c44342768908>", line 9
    """
      ^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 22-23: truncated \UXXXXXXXX escape


#Pull the CA and FL rows out of the latest state totals and group them by state
s1 = state_totals.loc[(state_totals['state'] == 'CA')]
s2 = state_totals.loc[(state_totals['state'] == 'FL')]
states_of_interest = pd.concat([s1,s2]).groupby(by='state')

print('US Summary Statistics')
print(us_totals, '\n')
#We don't actually need the means of CA and FL because they each only have one row. I did this for formatting.
#numeric_only=True: the text columns (Political_Affiliation, Region, state) cannot be
#averaged, and pandas 2.x raises a TypeError for them instead of silently dropping them.
print('CA and FL Summary Statistics')
print(states_of_interest.mean(numeric_only=True), '\n')
print('US States groupe by Political Affiliation Summary Statistics')
print(state_politic_totals.mean(numeric_only=True))

US Summary Statistics
     testingRate  positiveRate  percent_positiveRate   deathRate
0  110854.606338   8761.786715              0.079039  156.961609 

CA and FL Summary Statistics
         testingRate  positiveRate  percent_positiveRate   deathRate
state                                                               
CA     125647.230732   8861.546464              0.070527  136.980397
FL     104010.874144   8889.246572              0.085465  150.229980 

US States groupe by Political Affiliation Summary Statistics
                         testingRate  positiveRate  percent_positiveRate  \
Political_Affiliation                                                      
Democratic             132762.194179   7626.600673              0.063344   
Republican              96077.249994   9799.530075              0.123030   

                        deathRate  
Political_Affiliation              
Democratic             144.150750  
Republican             150.203195


#Rate columns of the totals tables: everything between the leading 'state'
#column and the two trailing text columns
cols_of_interest = list(Dem_totals.columns)[1:-2]
titles = ['Testing Rate Totals', 'Positives per 100,000 Totals', 'Percent Positive of Total Tests','Death Rate Totals']
print(cols_of_interest)

#One box plot and one two-sample t-test (Republican vs. Democratic states) per rate column
for i, subject in enumerate(titles):
    col = cols_of_interest[i]
    rep_values = Rep_totals[col]
    dem_values = Dem_totals[col]
    title = 'Democrat vs. Republican State ' + subject

    plt.figure(figsize=(4, 2))
    plt.boxplot([rep_values, dem_values], labels=['Republican', 'Democratic'], vert=False)
    plt.title(title, fontsize=15)
    plt.show()

    print('Democratic:', dem_values.mean())
    print('Republican:', rep_values.mean())

    ttest = stats.ttest_ind(rep_values, dem_values)
    #Flag the result at the 5% level
    if ttest.pvalue < 0.05:
        is_significant = 'SIGNIFICANT'
    else:
        is_significant = 'NOT SIGNIFICANT'
    print(subject + ":", ttest)
    print(is_significant, '\n')

['testingRate', 'positiveRate', 'percent_positiveRate', 'deathRate']

Democratic: 132762.19417880994
Republican: 96077.24999386033
Testing Rate Totals: Ttest_indResult(statistic=-2.647825761328732, pvalue=0.010867406400506143)
SIGNIFICANT

Democratic: 7626.600673209203
Republican: 9799.53007478047
Positives per 100,000 Totals: Ttest_indResult(statistic=3.5463011201158405, pvalue=0.0008712856003042247)
SIGNIFICANT

Democratic: 0.06334440208823938
Republican: 0.1230298841034768
Percent Positive of Total Tests: Ttest_indResult(statistic=4.691813695053396, pvalue=2.2063470586887677e-05)
SIGNIFICANT

Democratic: 144.15075007752637
Republican: 150.20319512708153
Death Rate Totals: Ttest_indResult(statistic=0.3756884881628496, pvalue=0.7087700622759839)
NOT SIGNIFICANT


print(cols_of_interest)
titles = ['Testing Rates', 'Positives per 100,000', 'Percent Positive of Total Tests', 'Death Rates']

#Region labels and their totals tables, in matching order
regions = ['Midwest', 'Northeast', 'South', 'West']
region_frames = [Midwest, Northeast, South, West]

#Per rate column: box plot by region, one-way ANOVA across the regions and,
#when the ANOVA is significant at 5%, a Tukey HSD comparison of every region pair
for i, subject in enumerate(titles):
    col = cols_of_interest[i]
    title = 'Region ' + subject
    data = [region_frame[col] for region_frame in region_frames]

    plt.boxplot(data, labels=regions, vert=False)
    plt.title(title, fontsize=25)
    plt.show()

    anova = stats.f_oneway(*data)
    if anova.pvalue < 0.05:
        is_significant = 'SIGNIFICANT'
    else:
        is_significant = 'NOT SIGNIFICANT'
    print(subject + ":", anova)
    print(is_significant, '\n')

    if is_significant == 'SIGNIFICANT':
        tukey = pairwise_tukeyhsd(endog=state_totals[col], groups=state_totals['Region'], alpha=0.05)
        print(tukey)

['testingRate', 'positiveRate', 'percent_positiveRate', 'deathRate']

Testing Rates: F_onewayResult(statistic=4.3419400642997505, pvalue=0.008820376813806223)
SIGNIFICANT 

         Multiple Comparison of Means - Tukey HSD, FWER=0.05          
======================================================================
  group1    group2    meandiff  p-adj     lower        upper    reject
----------------------------------------------------------------------
  Midwest Northeast  74707.6149 0.0047   18676.5087 130738.7211   True
  Midwest     South  23905.7597 0.5444  -24003.0457  71814.5652  False
  Midwest      West  34160.5844 0.2917  -16706.7679  85027.9366  False
Northeast     South -50801.8552 0.0604 -103182.6864   1578.9761  False
Northeast      West -40547.0305 0.2178   -95646.807  14552.7459  False
    South      West  10254.8246    0.9  -36561.3493  57070.9985  False
----------------------------------------------------------------------

Positives per 100,000: F_onewayResult(statistic=0.6478333291613055, pvalue=0.5882440066485599)
NOT SIGNIFICANT

Percent Positive of Total Tests: F_onewayResult(statistic=4.574690442292593, pvalue=0.006837864326236779)
SIGNIFICANT 

   Multiple Comparison of Means - Tukey HSD, FWER=0.05    
==========================================================
  group1    group2  meandiff p-adj   lower   upper  reject
----------------------------------------------------------
  Midwest Northeast  -0.0702 0.0111 -0.1278 -0.0126   True
  Midwest     South  -0.0351 0.2433 -0.0843  0.0142  False
  Midwest      West    -0.06 0.0188 -0.1122 -0.0077   True
Northeast     South   0.0352 0.3151 -0.0187   0.089  False
Northeast      West   0.0103    0.9 -0.0463  0.0669  False
    South      West  -0.0249 0.5177  -0.073  0.0232  False
----------------------------------------------------------

Death Rates: F_onewayResult(statistic=4.4129931741535975, pvalue=0.008158962572390084)
SIGNIFICANT 

     Multiple Comparison of Means - Tukey HSD, FWER=0.05     
=============================================================
  group1    group2  meandiff p-adj    lower    upper   reject
-------------------------------------------------------------
  Midwest Northeast  30.0732 0.5523  -30.8454  90.9918  False
  Midwest     South -10.6743    0.9  -62.7621  41.4135  False
  Midwest      West -49.2102 0.0971 -104.5146   6.0942  False
Northeast     South -40.7475 0.2396  -97.6975  16.2024  False
Northeast      West -79.2834 0.0051 -139.1895 -19.3774   True
    South      West -38.5359 0.1966  -89.4357   12.364  False
-------------------------------------------------------------


cols_of_interest = ['testingIncreaseRate', 'positiveIncreaseRate', 'deathIncreaseRate']
ylab = ['New Tests per 100,000 People','New Positives Increase Per 100,000 People','New Deaths per 100,000 People']
titles = ['Testing Increase Rate', 'Positive Increase Rate', 'Death Increase Rate']

#Per daily rate: plot both parties' centered 7-day rolling means, then
#t-test the unsmoothed daily values against each other
for i, c in enumerate(cols_of_interest):
    title = titles[i]
    dem_daily = Dem_covid[c]
    rep_daily = Rep_covid[c]

    plt.figure(figsize=(24,8))
    plt.plot(dem_daily.rolling(7, center=True).mean(), alpha=0.8, label='Democratic')
    plt.plot(rep_daily.rolling(7, center=True).mean(), color='red', alpha=0.8, label='Republican')

    plt.xlim(pd.to_datetime('2020-03-1'), pd.to_datetime('2021-03-07'))

    plt.xlabel('Date')
    plt.ylabel(ylab[i])
    plt.title(f'Democratic vs. Republican {title}', fontsize=25)
    plt.legend(loc=2, fontsize=16)
    plt.show()

    ttest = stats.ttest_ind(dem_daily, rep_daily)

    print(f'Democratic Average of {title}: {dem_daily.mean()}')
    print(f'Republican Average of {title}: {rep_daily.mean()}')
    print(ttest)

    #Flag the result at the 5% level
    if ttest.pvalue < 0.05:
        is_significant = 'SIGNIFICANT'
    else:
        is_significant = 'NOT SIGNIFICANT'
    print(is_significant, '\n')

Democratic Average of Testing Increase Rate: 303.6026000566923
Republican Average of Testing Increase Rate: 216.70698215825058
Ttest_indResult(statistic=6.2262532601951275, pvalue=7.612785511483494e-10)
SIGNIFICANT

Democratic Average of Positive Increase Rate: 19.6353677792971
Republican Average of Positive Increase Rate: 23.23090079258996
Ttest_indResult(statistic=-2.4228218786244744, pvalue=0.015615598133711831)
SIGNIFICANT

Democratic Average of Death Increase Rate: 0.37894826192270986
Republican Average of Death Increase Rate: 0.3770474425602762
Ttest_indResult(statistic=0.07734031594345926, pvalue=0.9383716402875895)
NOT SIGNIFICANT


#Scatter of daily death rate against daily testing rate for each party,
#with a separate least-squares line fitted per party
plt.figure(figsize=(16,8))

testing_col = 'testingIncreaseRate'
death_col = 'deathIncreaseRate'

plt.scatter(Dem_covid[testing_col], Dem_covid[death_col], alpha=0.4, label='Democratic')
plt.scatter(Rep_covid[testing_col], Rep_covid[death_col], alpha=0.4, color='red', label='Republican')

#scikit-learn expects a 2-D feature matrix, hence the single-column reshape
dem_features = np.array(Dem_covid[testing_col]).reshape(-1, 1)
rep_features = np.array(Rep_covid[testing_col]).reshape(-1, 1)

lr = LinearRegression()

lr.fit(dem_features, Dem_covid[death_col])
print(f'Democratic Estimate: y = {lr.coef_[0]}x + {lr.intercept_}')
plt.plot(Dem_covid[testing_col], lr.predict(dem_features), color='blue')

#The same estimator object is refitted for the Republican data
lr.fit(rep_features, Rep_covid[death_col])
print(f'Republican Estimate: y = {lr.coef_[0]}x + {lr.intercept_}')
plt.plot(Rep_covid[testing_col], lr.predict(rep_features), color='red')

plt.xlabel('Testing Increase Rate')
plt.ylabel('Death Increase Rate')
plt.title('Death Increase Rate vs. Testing Increase Rate')
plt.legend()
plt.show()

Democratic Estimate: y = 0.0007570574533882543x + 0.1491036506817379
Republican Estimate: y = 0.0017374746060789788x + 0.0005245641003060464


#Tag the national rows so they form a third group next to the CA and FL rows
us_covid['state'] = 'US'
print(us_covid['testingRate'].mean())

#Height of the annotation text on each figure (one entry per plotted column)
y_coord = [900, 110, 1.4]
#Date windows compared below, with the heading printed for each
timeframe = [['2020-03-1','2021-03-07'],['2020-05-4','2021-03-07'],['2020-05-4','2020-11-4']]
time_frame_titles = ['Full Time Frame', 'After Reopening on 5/04/2020','6 Months After Reopening on 5/04/2020']

#Plotted series: source frame plus its line styling
series_styles = [
    (CA_covid, {'alpha': 0.8, 'label': 'CA'}),
    (FL_covid, {'color': 'red', 'alpha': 0.8, 'label': 'FL'}),
    (us_covid, {'color': 'black', 'alpha': 0.8, 'label': 'US'}),
]

#Vertical markers: (line date, label date, label height factor, label text, color, extra line options)
milestones = [
    ('2020-05-4', '2020-05-5', 1, 'FL Stage 1 Reopening', 'tomato', {}),
    ('2020-06-3', '2020-06-4', 0.95, 'FL Stage 2 Reopening', 'tomato', {}),
    ('2020-09-25', '2020-09-26', 1, 'FL Full Reopening', 'tomato', {}),
    ('2020-11-4', '2020-11-5', 1, '6 months after Reopening', 'grey', {'linestyle': '--'}),
]

for i, c in enumerate(cols_of_interest):
    title = titles[i]

    #Centered 7-day rolling means of the daily rate for CA, FL and the US
    plt.figure(figsize=(24,8))
    for series_frame, line_style in series_styles:
        plt.plot(series_frame['date'], series_frame[c].rolling(7, center=True).mean(), **line_style)

    plt.xlim(pd.to_datetime('2020-03-1'), pd.to_datetime('2021-03-07'))

    for line_day, text_day, height_factor, caption, shade, line_opts in milestones:
        plt.axvline(x=pd.to_datetime(line_day), color=shade, **line_opts)
        plt.text(pd.to_datetime(text_day), y_coord[i] * height_factor, caption, fontsize=16, color=shade)

    plt.xlabel('Date')
    plt.ylabel(title)
    plt.title(f'California vs. Florida {title}', fontsize=25)
    plt.legend(loc=2, fontsize=16)
    plt.show()

    #Per window: averages, one-way ANOVA across CA / FL / US, and a Tukey HSD
    #comparison when the ANOVA is significant at 5%
    for j, (window_start, window_end) in enumerate(timeframe):
        first_day = pd.Timestamp(window_start)
        last_day = pd.Timestamp(window_end)
        CA_subset, FL_subset, us_subset = [
            source.loc[(source['date'] >= first_day) & (source['date'] <= last_day)]
            for source in (CA_covid, FL_covid, us_covid)
        ]
        tukey_data = pd.concat([CA_subset, FL_subset, us_subset])

        print(time_frame_titles[j])
        print(f'CA Average: {CA_subset[c].mean()}')
        print(f'FL Average: {FL_subset[c].mean()}')
        print(f'US Average: {us_subset[c].mean()}')

        anova = stats.f_oneway(CA_subset[c], FL_subset[c], us_subset[c])
        if anova.pvalue < 0.05:
            is_significant = 'SIGNIFICANT'
        else:
            is_significant = 'NOT SIGNIFICANT'
        print(anova)
        print(is_significant, '\n')
        if is_significant == 'SIGNIFICANT':
            tukey = pairwise_tukeyhsd(endog=tukey_data[c], groups=tukey_data['state'], alpha=0.05)
            print(tukey)
        print('\n')

32704.498431560332

Full Time Frame
CA Average: 340.50386812323114
FL Average: 279.5989363011025
US Average: 297.9908846231975
F_onewayResult(statistic=7.738448402126859, pvalue=0.00045967284764869684)
SIGNIFICANT 

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
======================================================
group1 group2 meandiff p-adj   lower    upper   reject
------------------------------------------------------
    CA     FL -60.9049  0.001 -98.1595 -23.6503   True
    CA     US  -42.513 0.0205 -79.7676  -5.2584   True
    FL     US  18.3919 0.4786 -18.7872  55.5711  False
------------------------------------------------------


After Reopening on 5/04/2020
CA Average: 402.0641669787608
FL Average: 331.2693166761591
US Average: 352.5139614346654
F_onewayResult(statistic=10.901738232167286, pvalue=2.092225537560887e-05)
SIGNIFICANT 

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
======================================================
group1 group2 meandiff p-adj   lower    upper   reject
------------------------------------------------------
    CA     FL -70.7949  0.001 -107.323 -34.2667   True
    CA     US -49.5502 0.0043 -86.0783 -13.0221   True
    FL     US  21.2446 0.3607 -15.2835  57.7728  False
------------------------------------------------------


6 Months After Reopening on 5/04/2020
CA Average: 252.6109999470836
FL Average: 246.9361520596759
US Average: 243.50516494556726
F_onewayResult(statistic=0.39304990260091477, pvalue=0.6751838208151788)
NOT SIGNIFICANT

Full Time Frame
CA Average: 24.01466755563204
FL Average: 23.89582411808338
US Average: 23.553175352034263
F_onewayResult(statistic=0.036184020963477934, pvalue=0.9644639333475139)
NOT SIGNIFICANT 


After Reopening on 5/04/2020
CA Average: 28.330687482666875
FL Average: 28.334504652936147
US Average: 27.29888233338877
F_onewayResult(statistic=0.18140674188779393, pvalue=0.8341258238304653)
NOT SIGNIFICANT 


6 Months After Reopening on 5/04/2020
CA Average: 12.126168955158308
FL Average: 19.51524237601211
US Average: 13.761458899484495
F_onewayResult(statistic=25.420779821536126, pvalue=2.7485678444325342e-11)
SIGNIFICANT 

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
====================================================
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
    CA     FL   7.3891  0.001  4.8307  9.9475   True
    CA     US   1.6353 0.2912 -0.9231  4.1937  False
    FL     US  -5.7538  0.001 -8.3122 -3.1954   True
----------------------------------------------------

Full Time Frame
CA Average: 0.37122058856336126
FL Average: 0.4038440322636638
US Average: 0.4219357132092284
F_onewayResult(statistic=1.9621018137149224, pvalue=0.14104988751355035)
NOT SIGNIFICANT 


After Reopening on 5/04/2020
CA Average: 0.4265407043428415
FL Average: 0.46655076116513294
US Average: 0.446123879167755
F_onewayResult(statistic=0.9582523708168573, pvalue=0.3839447026559414)
NOT SIGNIFICANT 


6 Months After Reopening on 5/04/2020
CA Average: 0.21255140158472935
FL Average: 0.39583414312325443
US Average: 0.2667737207042509
F_onewayResult(statistic=42.31457825594809, pvalue=7.989224827652206e-18)
SIGNIFICANT 

Multiple Comparison of Means - Tukey HSD, FWER=0.05
===================================================
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
    CA     FL   0.1833  0.001  0.1352 0.2314   True
    CA     US   0.0542 0.0226  0.0061 0.1023   True
    FL     US  -0.1291  0.001 -0.1772 -0.081   True
---------------------------------------------------


def _plot_ca_fl_regression(predictor, response, xlabel, ylabel):
    """
    Scatter `response` against `predictor` for California and Florida and
    overlay one least-squares line per state.

    Prints each state's fitted slope and intercept, shows the figure, and
    returns the Florida model (the last one fitted).

    Parameter predictor: name of the CA_covid / FL_covid column on the x axis
    Parameter response: name of the CA_covid / FL_covid column on the y axis
    Parameter xlabel: x-axis label, also used in the figure title
    Parameter ylabel: y-axis label, also used in the figure title
    """
    plt.figure(figsize=(16,8))

    plt.scatter(CA_covid[predictor], CA_covid[response], alpha=0.4, label='California')
    plt.scatter(FL_covid[predictor], FL_covid[response], alpha=0.4, color='red', label='Florida')

    model = None
    for state_frame, state_name, line_color in ((CA_covid, 'California', 'blue'), (FL_covid, 'Florida', 'red')):
        #scikit-learn expects a 2-D feature matrix, hence the single-column reshape
        features = np.array(state_frame[predictor]).reshape(-1, 1)
        model = LinearRegression()
        model.fit(features, state_frame[response])
        print('{} Estimate: y = {}x + {}'.format(state_name, model.coef_[0], model.intercept_))
        plt.plot(state_frame[predictor], model.predict(features), color=line_color)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title('{} vs. {}'.format(ylabel, xlabel))
    plt.legend()
    plt.show()
    return model


#Daily rates: new positives per 100,000 against new tests per 100,000.
#The x label previously read 'Testing Rate' although the daily increase rate is plotted.
predictor = 'testingIncreaseRate'
response = 'positiveIncreaseRate'
lr = _plot_ca_fl_regression(predictor, response, 'Testing Increase Rate', 'Positive Increase Rate')

#Cumulative rates: total positives per 100,000 against total tests per 100,000.
#The y label and title previously said 'Positive Increase Rate' (copied from the
#figure above) although the cumulative positiveRate column is plotted.
predictor = 'testingRate'
response = 'positiveRate'
lr = _plot_ca_fl_regression(predictor, response, 'Testing Rate', 'Positive Rate')

California Estimate: y = 0.09494737183836505x + -8.31527982346601
Florida Estimate: y = 0.0914051263033822x + -1.5294061646749952

California Estimate: y = 0.07231418345581x + -293.1136465012023
Florida Estimate: y = 0.08551044849506438x + -3.371142379965022


import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col #Closest thing to the 'estout'pretty tables

#Stack the Florida rows on top of the California rows.
#pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
FloCal = pd.concat([FL_covid, CA_covid])
#Indicator: 1 for Florida rows, 0 for California rows. Built as an integer column
#because newer pandas returns booleans from get_dummies, which would turn the
#regressor matrix below into a non-numeric array.
FloCal['is_Florida'] = (FloCal['state'] == 'FL').astype(int)
#Column of ones so the OLS fits include an intercept
FloCal['constant'] = 1
#Rows dated in 2021 only (partial-string lookup on the date index)
FloCal_recent = FloCal.set_index('date').loc['2021']


#Train regression to view testing rate effect ONLY on the positive rate
reg0 = sm.OLS(FloCal['positiveRate'],FloCal[['testingRate','constant']]).fit()
#Train regression to view state and testing rate effect on the positive rate
reg1 = sm.OLS(FloCal['positiveRate'],FloCal[['is_Florida','testingRate','constant']]).fit()
#Train regression to do same as reg1, limiting data to only 2021
reg2 = sm.OLS(FloCal_recent['positiveRate'],FloCal_recent[['is_Florida','testingRate','constant']]).fit()

#Side-by-side table of the three fits, with the observation count added per model
reg_names = ["OLS I","OLS II","OLS III"]
reg_Order = ['testingRate','is_Florida','constant']
print (summary_col([reg0,reg1,reg2],stars=True,float_format='%0.2f', model_names=reg_names,regressor_order=reg_Order,
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs))}))

===========================================
              OLS I     OLS II    OLS III  
-------------------------------------------
testingRate 0.08***   0.08***    0.08***   
            (0.00)    (0.00)     (0.00)    
is_Florida            767.86***  1154.30***
                      (30.91)    (46.87)   
constant    -86.51*** -525.27*** -110.31   
            (29.57)   (28.25)    (181.61)  
R-squared   0.96      0.98       0.94      
            0.96      0.98       0.94      
N           773       773        132       
===========================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


#Difference in Difference Model Before and After Sept 25, 2020 (Florida's Complete Reopening)
import datetime as dt
treatment_date = dt.datetime(year=2020, month=9, day=25)

#Post = 1 on or after the reopening date, Treatment = 1 for Florida rows,
#and their product marks Florida rows after the reopening
on_or_after_reopening = FloCal['date'] >= treatment_date
FloCal['Post'] = np.where(on_or_after_reopening, 1, 0)
FloCal['Treatment'] = FloCal['is_Florida']
FloCal['Post x Treatment'] = FloCal['Post'] * FloCal['Treatment']

#Train regression to assess effect of being in Florida after its complete Reopening
did_regressors = ['Post', 'Treatment', 'Post x Treatment', 'testingRate', 'constant']
reg = sm.OLS(FloCal['positiveRate'], FloCal[did_regressors]).fit()

#Extra rows for the summary table: observation count and R-squared
did_info = {
    'N': lambda x: "{0:d}".format(int(x.nobs)),
    'R2': lambda x: "{:.2f}".format(x.rsquared),
}
print(summary_col([reg], stars=True, float_format='%0.2f', info_dict=did_info))

=============================
                 positiveRate
-----------------------------
Post             -1391.85*** 
                 (43.96)     
Treatment        346.07***   
                 (25.75)     
Post x Treatment 1077.69***  
                 (39.61)     
testingRate      0.09***     
                 (0.00)      
constant         -317.75***  
                 (19.82)     
R-squared        0.99        
                 0.99        
N                773         
R2               0.99        
=============================
Standard errors in
parentheses.
* p<.1, ** p<.05, ***p<.01


#Gap between the Democratic-state and Republican-state averages, split at the
#3/13/2020 national emergency declaration (the "before" window starts in 2016)
unemployment_politic['Difference'] = unemployment_politic['Democrat'] - unemployment_politic['Republican']
politic_dates = unemployment_politic['Date']
emergency_cutoff = pd.Timestamp('3-13-2020')
diff_before = unemployment_politic.loc[(politic_dates >= pd.Timestamp('1-1-2016')) & (politic_dates < emergency_cutoff)]
diff_after = unemployment_politic.loc[politic_dates >= emergency_cutoff]
before_mean = diff_before['Difference'].mean()
after_mean = diff_after['Difference'].mean()

#Figure 1: both party averages over time
plt.figure(figsize=(24,8))
plt.plot(politic_dates, unemployment_politic['Democrat'], alpha=0.8, label='Democratic')
plt.plot(politic_dates, unemployment_politic['Republican'], color='red', alpha=0.8, label='Republican')
plt.xlim(pd.to_datetime('2019-01-1'), pd.to_datetime('2021-03-07'))
plt.title('Democratic vs. Republican Unemployment Rate', fontsize=25)
plt.axvline(x=pd.to_datetime('2020-03-13'), color='grey')
plt.text(pd.to_datetime('2020-03-14'), 5, 'National Emergency Declared', fontsize=16, color='grey')
plt.legend(loc=2, fontsize=16)
plt.show()

#Figure 2: the gap itself, with the pre-emergency mean as a dotted reference line
plt.figure(figsize=(24,8))
plt.plot(politic_dates, unemployment_politic['Difference'], alpha=0.8)
plt.axhline(y=before_mean, linestyle='dotted', color='grey')
plt.title('Democratic vs. Republican Difference in Unemployment Rate', fontsize=25)
plt.xlim(pd.to_datetime('2019-01-1'), pd.to_datetime('2021-03-07'))
plt.axvline(x=pd.to_datetime('2020-03-13'), color='grey')
plt.text(pd.to_datetime('2020-03-14'), 2, 'National Emergency Declared', fontsize=16, color='grey')
plt.show()

#T-test of the gap before vs. after the declaration, flagged at the 5% level
print('Before COVID National Emergency:', before_mean)
print('After COVID National Emergency:', after_mean)
ttest = stats.ttest_ind(diff_before['Difference'], diff_after['Difference'])
if ttest.pvalue < 0.05:
    is_significant = 'SIGNIFICANT'
else:
    is_significant = 'NOT SIGNIFICANT'
print( "Before vs. After COVID National Emergency:", ttest)
print(is_significant, '\n')

Before COVID National Emergency: -0.0070170660856935215
After COVID National Emergency: 1.9776515151515155
Before vs. After COVID National Emergency: Ttest_indResult(statistic=-31.368989958593207, pvalue=6.409661144533248e-39)
SIGNIFICANT


#Weekly California minus Florida insured unemployment rate.
#.copy() gives CA_FL_diff its own data, so adding the column below no longer
#triggers pandas' SettingWithCopyWarning.
CA_FL_diff = california_unemployment[['filed_week_ended']].copy()
#NOTE(review): the subtraction pairs rows by index label; this assumes both files
#list the same weeks in the same order - confirm
CA_FL_diff['Difference'] = california_unemployment['insured_unemployment_rate']-florida_unemployment['insured_unemployment_rate']
#Split at the 3/13/2020 national emergency declaration
diff_before = CA_FL_diff.loc[(CA_FL_diff['filed_week_ended'] < pd.Timestamp('3-13-2020'))]
diff_after = CA_FL_diff.loc[(CA_FL_diff['filed_week_ended'] >= pd.Timestamp('3-13-2020'))]

#Plotting
#Figure 1: the two states' rates. The title no longer says "Difference", and the
#lines are labeled so the legend tells them apart.
plt.figure(figsize=(24,8))
plt.plot(california_unemployment['filed_week_ended'], california_unemployment['insured_unemployment_rate'], alpha=0.8, label='California')
plt.plot(florida_unemployment['filed_week_ended'], florida_unemployment['insured_unemployment_rate'], alpha=0.8, color='red', label='Florida')
plt.xlim(pd.to_datetime('2019-01-1'),pd.to_datetime('2021-04-07'))
plt.axvline(x=pd.to_datetime('2020-05-4'), color='tomato')
plt.text(pd.to_datetime('2020-05-5'), 3, 'FL Stage 1 Reopening', fontsize=16, color='tomato')
plt.title('California vs. Florida Unemployment Rate', fontsize=25)
plt.axvline(x=pd.to_datetime('2020-03-13'), color='grey')
plt.text(pd.to_datetime('2020-03-14'), 5, 'National Emergency Declared', fontsize=16, color='grey')
plt.legend(loc=2, fontsize=16)
plt.show()

#Figure 2: the CA - FL gap, with the pre-emergency mean as a dotted reference line
plt.figure(figsize=(24,8))
plt.plot(CA_FL_diff['filed_week_ended'], CA_FL_diff['Difference'], alpha=0.8)
plt.xlim(pd.to_datetime('2019-01-1'),pd.to_datetime('2021-04-07'))
plt.axvline(x=pd.to_datetime('2020-05-4'), color='tomato')
plt.text(pd.to_datetime('2020-05-5'), 3, 'FL Stage 1 Reopening', fontsize=16, color='tomato')
plt.axvline(x=pd.to_datetime('2020-03-13'), color='grey')
plt.text(pd.to_datetime('2020-03-14'), 5, 'National Emergency Declared', fontsize=16, color='grey')
plt.axhline(y = diff_before['Difference'].mean(), linestyle='dotted', color='grey')
plt.title('California vs. Florida Unemployment Rate Difference', fontsize=25)
plt.show()

#TTest of the gap before vs. after the declaration, flagged at the 5% level
print('Before COVID National Emergency:', diff_before['Difference'].mean())
print('After COVID National Emergency:', diff_after['Difference'].mean())
ttest = stats.ttest_ind(diff_before['Difference'], diff_after['Difference'])
is_significant = 'SIGNIFICANT' if ttest.pvalue < 0.05 else 'NOT SIGNIFICANT'
print( "Before vs. After COVID National Emergency:", ttest)
print(is_significant, '\n')

C:\Users\kathe\anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Before COVID National Emergency: 1.5761666666666672
After COVID National Emergency: 5.903157894736845
Before vs. After COVID National Emergency: Ttest_indResult(statistic=-23.069016770125863, pvalue=2.944077310866634e-82)
SIGNIFICANT

Give Me Liberty or Give Me… Bankruptcy: Democratic vs. Republican Covid Policies¶

Research Question¶

Background Information¶

Data Collection and Cleaning¶

Load and Clean Data¶

Make Calculations and Group Data¶

Data Description¶

National and State data:¶

Unemployment Data (Weekly)¶

Unemployment Data (Monthly)¶

State Information(population, region, political affiliation):¶

State Populations:¶

State Political Affiliations: (Based on 2020 Election)¶

Regions of US According to Census Bureau¶

California Timeline of Covid Policy:¶

Florida Timeline of Covid Policy:¶

Preregistration Statements¶

Diff-In-Diff Model: Effects of Fully Reopening FL¶

Objective:¶

Formula:¶

Hypothesis:¶

Where:¶

Linear Regression Model: Effect of Being in FL on Positive Test Increase Rates¶

Objective:¶

Formula:¶

Hypothesis:¶

Where:¶

ANOVA and Tukey Tests on testing, positives, and death rates of 4 regions of US, and with CA, FL, and US Averages¶

Objective:¶

Hypothesis:¶

Background Information:¶

Data Limitations¶

Exploratory Data Analysis¶

T-Tests of Totals of Republican and Democratic States¶

Analysis¶

ANOVA of US Regions¶

Analysis¶

T-Tests of Democratic and Republican State Increases per Day¶

Analysis¶

Aberration in Data¶

ANOVA of California and Florida Increases per Day¶

Analysis¶

Linear Regression Model: Effect of Being in FL on Positive Test Increase Rates¶

Analysis¶

Diff-In-Diff Model: Effects of Fully Reopening FL¶

Analysis¶

Conclusion¶

Data Limitations¶