import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


df = pd.read_csv("~/Downloads/archive_1/NGS.csv")


df.head()


HAND_WRITE_String = { 0: "No Response", 1: "Left", 2: "Right", }

df['HAND_WRITE'] = df['HAND_WRITE'].replace(HAND_WRITE_String)
df.head()


HAND_THROW_String = { 0: "No Response", 1: "Left", 2: "Right", }

df['HAND_THROW'] = df['HAND_THROW'].replace(HAND_THROW_String)
df.head()


WORK_ENVIOR_String = {   
    0: 'NR',
    1: 'Home',
    2: 'Factory',
    3: 'Office',
    4: 'Outdoors',
    5: 'Other',
    6: 'No Job' 
}
df['WORK_ENVIOR'] = df['WORK_ENVIOR'].replace(WORK_ENVIOR_String)
df.head()


df = df.rename(columns={'Unnamed': 'Column_Name'})
df.head()


df = df.rename(columns={df.columns[0]: 'Respondents'})
df.set_index('Respondents', inplace=True)
df.head()


DAYS_W_PERF_String = {   
    0: 'NR',
    1: '1-2',
    2: '3-4',
    3: '5-7',
    4: '0', 
}
df['DAYS_W_PERF'] = df['DAYS_W_PERF'].replace(DAYS_W_PERF_String)
df.head()


mapping = {0: 'NO', 1: 'YES'}
columns_to_map = ['CHEM_EXPOSE', 'PREG', 'FLU_COLD', 'HEAD_INJ', 'ALLERGIES', 'UNK', 'NO_LOSS']
df[columns_to_map] = df[columns_to_map].replace(mapping)
df.head()


df.head()


mapping1 = {0: 'NR', 1: 'YES', 2: 'NO'}
columns_to_map1 = [
    'REGAIN_SMELL', 'MD_ABOUT_LOSS', 'ALLERG_ANIM', 'ALLERG_POLLEN', 'ALLERG_FOOD',
    'ALLERG_DRUG', 'NONE_ABOVE', 'ARTHRITIS', 'DIABETES', 'ULCER', 'HYPERTEN', 'NONE', 'SMOKE'
]
df[columns_to_map1] = df[columns_to_map1].replace(mapping1)
df.head()


mapping2 = {
    0: 'No response', 1: 'Argentina', 2: 'Bolivia', 3: 'Brazil', 4: 'Chile',
    5: 'Colombia', 6: 'Costa Rica', 7: 'Ecuador', 8: 'Guatemala', 9: 'Honduras',
    10: 'Mexico', 11: 'Panama', 12: 'Peru', 13: 'Uruguay', 14: 'Venezuela',
    15: 'Egypt', 16: 'Kenya', 17: 'Malawi', 18: 'Nigeria', 19: 'South Africa',
    20: 'Zambia', 21: 'Zimbabwe', 22: 'Bahrain', 23: 'China', 24: 'Taiwan',
    25: 'Hong Kong', 26: 'India', 27: 'Iran', 28: 'Israel', 29: 'Japan',
    30: 'Kuwait', 31: 'Malaysia', 32: 'Oman', 33: 'Pakistan', 34: 'Saudi Arabia',
    35: 'Singapore', 36: 'South Korea', 38: 'Thailand', 39: 'SRI or TURK',
    40: 'United Arab Emirates', 41: 'USSR', 42: 'Australia', 43: 'New Zealand',
    44: 'Iceland', 45: 'Bahamas', 46: 'Barbados', 47: 'Bermuda',
    48: 'Dominican Republic', 49: 'Jamaica', 50: 'Nether. Antilles',
    51: 'Trin & Tabago', 52: 'Austria', 53: 'Belgium', 54: 'Czechos',
    55: 'Denmark', 56: 'Finland', 57: 'France', 58: 'Greece', 59: 'Hungary',
    60: 'Italy', 61: 'Luxembourg', 62: 'Netherlands', 63: 'Norway', 64: 'Poland',
    65: 'Portugal', 66: 'Romania', 67: 'Spain', 68: 'Sweden', 69: 'Switzerland',
    70: 'West Germany', 71: 'Yugoslavia', 72: 'Cyprus', 73: 'Malta',
    74: 'Indonesia', 75: 'Papua New Guinea', 76: 'Philippines',
    77: 'Channel Islands', 78: 'England', 79: 'Ireland',
    80: 'Northern Ireland', 81: 'Scotland', 82: 'Wales', 83: 'Canada', 84: 'USA'
}
df['COUNTRY'] = df['COUNTRY'].replace(mapping2)
print(df['COUNTRY'])

Respondents
1              USA
2              USA
3              USA
4              USA
5              USA
            ...   
1421058    England
1421059    Bahamas
1421060     France
1421061    England
1421062    England
Name: COUNTRY, Length: 1421062, dtype: object


sex_mapping = {0: 'NR', 1: 'Male', 2: 'Female'}
df['SEX'] = df['SEX'].replace(sex_mapping)
df['SEX'].head()

Respondents
1    Female
2    Female
3    Female
4      Male
5    Female
Name: SEX, dtype: object


pregnant_mapping = {0: 'No', 1: 'Yes'}
df['PREGNANT'] = df['PREGNANT'].replace(pregnant_mapping)
df['PREGNANT'].head()

Respondents
1    No
2    No
3    No
4    No
5    No
Name: PREGNANT, dtype: object


ethnic_mapping = {
    0: 'No response',
    1: 'Black',
    2: 'White',
    3: 'Asian',
    4: 'American Indian',
    5: 'Hispanic',
    6: 'Other',
    7: 'Prefer not to answer'
}
df['ETHNIC'] = df['ETHNIC'].replace(ethnic_mapping)
df['ETHNIC'].head()

Respondents
1    White
2    White
3    White
4    White
5    White
Name: ETHNIC, dtype: object


smell_mapping = {0: 'No response', 1: 'Yes', 2: 'No'}
smellcolumns_to_map = ['AND_SMELL', 'AMY_SMELL', 'GALAX_SMELL', 'EUG_SMELL', 'MERCAP_SMELL', 'ROSE_SMELL']
df[smellcolumns_to_map] = df[smellcolumns_to_map].replace(smell_mapping)
df[smellcolumns_to_map].head()


mem_mapping = {0: 'No response', 1: 'Yes', 2: 'No'}
memcolumns_to_map = ['AND_MEM', 'AA_MEM', 'GAL_MEM', 'EUG_MEM', 'MER_MEM', 'ROSE_MEM']
df[memcolumns_to_map] = df[memcolumns_to_map].replace(mem_mapping)
df[memcolumns_to_map].head()


eat_wear_mapping = {0: 'No response', 1: 'Yes', 2: 'No', 3: 'No opinion'}
eat_wear_columns_to_map = ['AND_EAT', 'AA_EAT', 'GAL_EAT', 'EUG_EAT', 'MER_EAT', 'ROSE_EAT',
                   'AND_WEAR', 'AA_WEAR', 'GAL_WEAR', 'EUG_WEAR', 'MER_WEAR', 'ROSE_WEAR']
df[eat_wear_columns_to_map] = df[eat_wear_columns_to_map].replace(eat_wear_mapping)
print(df[eat_wear_columns_to_map])

                 AND_EAT AA_EAT      GAL_EAT     EUG_EAT      MER_EAT  \
Respondents                                                             
1                     No    Yes           No         Yes           No   
2                     No    Yes           No         Yes  No response   
3                     No    Yes           No         Yes           No   
4                     No     No  No response         Yes           No   
5            No response     No           No          No           No   
...                  ...    ...          ...         ...          ...   
1421058               No    Yes           No         Yes           No   
1421059               No     No           No         Yes           No   
1421060               No    Yes           No          No           No   
1421061               No    Yes   No opinion  No opinion           No   
1421062              Yes    Yes          Yes          No           No   

            ROSE_EAT     AND_WEAR      AA_WEAR     GAL_WEAR     EUG_WEAR  \
Respondents                                                                
1                 No           No          Yes           No   No opinion   
2                 No          Yes           No          Yes           No   
3                 No          Yes  No response   No opinion  No response   
4                 No   No opinion           No  No response           No   
5                 No  No response           No           No           No   
...              ...          ...          ...          ...          ...   
1421058           No           No           No          Yes           No   
1421059           No           No          Yes          Yes           No   
1421060           No           No           No          Yes          Yes   
1421061           No           No           No           No           No   
1421062          Yes          Yes   No opinion          Yes           No   

                MER_WEAR ROSE_WEAR  
Respondents                         
1                     No       Yes  
2            No response       Yes  
3                     No       Yes  
4                     No        No  
5                     No        No  
...                  ...       ...  
1421058               No       Yes  
1421059               No       Yes  
1421060               No       Yes  
1421061               No        No  
1421062               No       Yes  

[1421062 rows x 12 columns]


des_mapping = {
    0: 'No response',
    1: 'No odor',
    2: 'Floral',
    3: 'Musky',
    4: 'Urine',
    5: 'Foul',
    6: 'Ink',
    7: 'Spicy',
    8: 'Woody',
    9: 'Fruity',
    10: 'Burnt',
    11: 'Sweet',
    12: 'Other'
}
des_columns_to_map = ['AND_DES', 'AA_DES', 'GAL_DES', 'EUG_DES', 'MER_DES', 'ROSE_DES']
df[des_columns_to_map] = df[des_columns_to_map].replace(des_mapping)
df[des_columns_to_map].head()


hand_mapping = {0: 'No response', 1: 'Left', 2: 'Ambidextrous', 3: 'Right'}
df['hand'] = df['hand'].replace(hand_mapping)
df['hand'].head()

Respondents
1    Right
2    Right
3    Right
4    Right
5     Left
Name: hand, dtype: object


decage_mapping = {0: 'No response', 1: 'Teens', 2: 'Twenties', 3: 'Thirties', 4: 'Forties', 5: 'Fifties', 6: 'Sixties', 7: 'Seventies', 8: 'Eighties', 9: 'Nineties'}
df['decage'] = df['decage'].replace(decage_mapping)
print(df['decage'])

Respondents
1          Thirties
2           Forties
3          Eighties
4          Twenties
5           Sixties
             ...   
1421058    Twenties
1421059    Thirties
1421060     Sixties
1421061    Twenties
1421062     Sixties
Name: decage, Length: 1421062, dtype: object


corr_mapping = {0: 'No response', 1: 'Correct', 2: 'Incorrect'}
corr_columns_to_map = ['corr1', 'corr2', 'corr3', 'corr4', 'corr5', 'corr6']
df[corr_columns_to_map] = df[corr_columns_to_map].replace(corr_mapping)
df[corr_columns_to_map].head()


import matplotlib.pyplot as plt
import numpy as np

# Remove 'NR' from selected_vars
selected_vars = ['AND_QUAL', 'AA_QUAL', 'GAL_QUAL', 'EUG_QUAL', 'MERCAP_QUAL', 'ROSE_QUAL']

# Exclude 'NR' from grouped_data
grouped_data = df[df['SEX'] != 'NR'].groupby('SEX')[selected_vars]

# Calculate means for each odor variable
means = grouped_data.mean()

# Create a bar plot without error bars
fig, ax = plt.subplots(figsize=(12, 8))
x = np.arange(len(selected_vars))
width = 0.35

for i, (label, row) in enumerate(means.iterrows()):
    ax.bar(x + i * width, row, width, label=label)

ax.set_title('Quality of Odor Responses by Gender')
ax.set_xlabel('Odor Variable')
ax.set_ylabel('Mean Smell Perception')
ax.set_xticks(x + width * (len(means) - 1) / 2)

# Set the custom x-axis labels
custom_labels = ['Androstenone', 'Amyl Acetate', 'Galaxolide', 'Eugenol', 'Mercaptans', 'Rose']
ax.set_xticklabels(custom_labels, rotation=0)

ax.legend(title='Gender')

plt.show()


url = "https://raw.githubusercontent.com/jakobschanzer/Smells/master/smoking.csv"
df1 = pd.read_csv(url)
df1.head()


df_1987 = df1.loc[df1['Year']==1987]
df_1987.head()


df.rename(columns={'COUNTRY': 'Country'}, inplace=True)
df['Country'].head()

Respondents
1    USA
2    USA
3    USA
4    USA
5    USA
Name: Country, dtype: object


pd.set_option('display.max_rows',None)
df_1987.head()


british = ['England', 'Scotland', 'Northern Ireland' ,'Wales', 'Channel Islands']
for british_country in british:
    df['Country']= df['Country'].replace(british_country, 'United Kingdom')
Czechos = ['Czech Republic', 'Slovakia']
for czech in Czechos:
    df_1987['Country'] = df_1987['Country'].replace(Czechos, 'Czechos')
df['Country'].head()

/var/folders/w2/g42kbvzd74v8055fqnq1mj5h0000gn/T/ipykernel_59432/2067893382.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1987['Country'] = df_1987['Country'].replace(Czechos, 'Czechos')

Respondents
1    USA
2    USA
3    USA
4    USA
5    USA
Name: Country, dtype: object


yugo = ['Bosnia and Herzegovina', 'Croatia', 'Kosovo', 'Macedonia', 'Montenegro', 'Serbia', 'Slovenia']
for yu in yugo:
    df_1987['Country'] = df_1987['Country'].replace(yugo, 'Yugoslavia')
df_1987.head()

/var/folders/w2/g42kbvzd74v8055fqnq1mj5h0000gn/T/ipykernel_59432/3778819076.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1987['Country'] = df_1987['Country'].replace(yugo, 'Yugoslavia')


def custom_aggregation(group):
    result = pd.Series(dtype='float64')  
    result['Year'] = group['Year'].max()
    result['Data.Daily cigarettes'] = group['Data.Daily cigarettes'].sum()
    result['Data.Percentage.Male'] = group['Data.Percentage.Male'].mean()
    result['Data.Percentage.Female'] = group['Data.Percentage.Female'].mean()
    result['Data.Percentage.Total'] = group['Data.Percentage.Total'].mean()
    result['Data.Smokers.Total'] = group['Data.Smokers.Total'].sum()
    result['Data.Smokers.Female'] = group['Data.Smokers.Female'].sum()
    result['Data.Smokers.Male'] = group['Data.Smokers.Male'].sum()
    return result

grouped_df_1987 = df_1987.groupby('Country').apply(custom_aggregation).reset_index()
grouped_df_1987.head()


merged = pd.merge(df, grouped_df_1987, on = 'Country', how='inner')
merged.head()


columns_to_include = ['Country', 'totcorr'] + list(grouped_df_1987.columns.difference(['Country']))
new_df = merged[columns_to_include].copy()
new_df.head()


country_stats = new_df.groupby('Country')['totcorr'].agg(['mean', 'std']).reset_index()
final_df = new_df.drop_duplicates(subset='Country')
final_df = pd.merge(final_df, country_stats, on='Country', how='left')
final= final_df.drop('totcorr', axis=1)
final = final.rename(columns={'mean': "Average Correct by Country"})
display(final)


selected_columns = ['Country', 'Data.Daily cigarettes', 'Data.Percentage.Total', 'Data.Smokers.Total', 'Average Correct by Country', 'std']
selected = final[selected_columns]
selected.head()


selected = selected.rename(columns={
    'Data.Daily cigarettes': 'Average Daily Cigarettes',
    'Data.Smokers.Total': 'Total Smokers',
    'Data.Percentage.Total': 'Percent Who Smoke'
})

selected.head()


correlation = selected['Average Correct by Country'].corr(selected['Average Daily Cigarettes'])
print("Correlation between Average Correct by Country and Average Daily Cigarettes:", correlation)


import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.scatterplot(x='Average Daily Cigarettes', y='Average Correct by Country', data=selected)
plt.title('Scatter Plot: Average Daily Cigarettes vs. Average Correct by Country')
plt.xlabel('Average Daily Cigarettes')
plt.ylabel('Average Correct by Country')
plt.show()

Correlation between Average Correct by Country and Average Daily Cigarettes: 0.06642243132539802


correlation_percent_smoke = selected['Average Correct by Country'].corr(selected['Percent Who Smoke'])
print("Correlation between Average Correct by Country and Percent Who Smoke:", correlation_percent_smoke)

plt.figure(figsize=(8, 6))
sns.scatterplot(x='Percent Who Smoke', y='Average Correct by Country', data=selected)
plt.title('Scatter Plot: Percent Who Smoke vs. Average Correct by Country')
plt.xlabel('Percent Who Smoke')
plt.ylabel('Average Correct by Country')
plt.show()

Correlation between Average Correct by Country and Percent Who Smoke: 0.13989192537550763


#Pairplot
sns.pairplot(selected[['Average Correct by Country', 'Percent Who Smoke', 'Average Daily Cigarettes']])
plt.suptitle('Pairplot: Average Correct by Country, Percent Who Smoke, Average Daily Cigarettes', y=1.02)
plt.show()


# Jointplot
sns.jointplot(x='Percent Who Smoke', y='Average Correct by Country', data=selected, kind='reg')
plt.suptitle('Jointplot: Percent Who Smoke vs. Average Correct by Country', y=1.02)
plt.show()


new_df['SEX']=df['SEX']
new_df.head()


df_male = new_df[new_df['SEX'] == 'Male'].copy()
df_female = new_df[new_df['SEX'] == 'Female'].copy()

df_male = df_male.dropna(subset=['SEX'])
df_female = df_female.dropna(subset=['SEX'])

df_male = df_male.drop(['Data.Smokers.Female',  'Data.Smokers.Total', 'Data.Daily cigarettes', 'Data.Percentage.Total', 'Data.Percentage.Female', 'Year'], axis=1)
df_female = df_female.drop(['Data.Percentage.Male', 'Data.Smokers.Male', 'Data.Smokers.Total','Data.Daily cigarettes', 'Data.Percentage.Total', 'Year'], axis=1)

df_male.head(), df_female.head()

(   Country  totcorr  Data.Percentage.Male  Data.Smokers.Male   SEX
 4   Canada        3                  32.0          3453267.0  Male
 7   Canada        2                  32.0          3453267.0  Male
 9   Canada        4                  32.0          3453267.0  Male
 14  Canada        4                  32.0          3453267.0  Male
 17  Canada        3                  32.0          3453267.0  Male,
   Country  totcorr  Data.Percentage.Female  Data.Smokers.Female     SEX
 1  Canada        5               33.599998            3236307.0  Female
 2  Canada        4               33.599998            3236307.0  Female
 3  Canada        4               33.599998            3236307.0  Female
 5  Canada        5               33.599998            3236307.0  Female
 6  Canada        5               33.599998            3236307.0  Female)


df_male = df_male.rename(columns={
    'totcorr': 'Total Correctly Guessed',
    'Data.Percentage.Male': 'Percent of Men that Smoke',
    'Data.Smokers.Male': 'Total Male Smokers'
})

df_female = df_female.rename(columns={
    'totcorr': 'Total Correctly Guessed',
    'Data.Percentage.Female': 'Percent of Women that Smoke',
    'Data.Smokers.Female': 'Total Female Smokers'
})
df_male.head(), df_female.head()

(   Country  Total Correctly Guessed  Percent of Men that Smoke  \
 4   Canada                        3                       32.0   
 7   Canada                        2                       32.0   
 9   Canada                        4                       32.0   
 14  Canada                        4                       32.0   
 17  Canada                        3                       32.0   
 
     Total Male Smokers   SEX  
 4            3453267.0  Male  
 7            3453267.0  Male  
 9            3453267.0  Male  
 14           3453267.0  Male  
 17           3453267.0  Male  ,
   Country  Total Correctly Guessed  Percent of Women that Smoke  \
 1  Canada                        5                    33.599998   
 2  Canada                        4                    33.599998   
 3  Canada                        4                    33.599998   
 5  Canada                        5                    33.599998   
 6  Canada                        5                    33.599998   
 
    Total Female Smokers     SEX  
 1             3236307.0  Female  
 2             3236307.0  Female  
 3             3236307.0  Female  
 5             3236307.0  Female  
 6             3236307.0  Female  )


mean_total_correct_male = df_male['Total Correctly Guessed'].mean()
std_total_correct_male = df_male['Total Correctly Guessed'].std()

mean_total_correct_female = df_female['Total Correctly Guessed'].mean()
std_total_correct_female = df_female['Total Correctly Guessed'].std()

mean_total_correct_male, std_total_correct_male, mean_total_correct_female, std_total_correct_female

(3.1549752538147846, 1.1821878409311295, 3.1562444822362856, 1.178658293463597)


grouped_mean_std_male = df_male.groupby('Country')['Total Correctly Guessed'].agg(['mean', 'std'])

grouped_mean_std_female = df_female.groupby('Country')['Total Correctly Guessed'].agg(['mean', 'std'])

grouped_mean_std_male.head(), grouped_mean_std_female.head()

(               mean       std
 Country                      
 Argentina  2.632075  1.125537
 Australia  3.037796  1.201065
 Austria    3.125874  1.186870
 Bahamas    3.297297  1.131489
 Bahrain    2.960000  1.098484,
                mean       std
 Country                      
 Argentina  2.584314  1.257947
 Australia  3.034619  1.205273
 Austria    3.162234  1.183333
 Bahamas    3.052174  1.190927
 Bahrain    3.063830  1.435637)


side_by_side_table = pd.concat([grouped_mean_std_male.add_suffix('_Male'), grouped_mean_std_female.add_suffix('_Female')], axis=1)

side_by_side_table.head()


from scipy.stats import ttest_ind

statistically_different_countries = []
counts_per_country = {}

for country in grouped_mean_std_male.index:
    mean_male = grouped_mean_std_male.loc[country, 'mean']
    std_male = grouped_mean_std_male.loc[country, 'std']
    
    mean_female = grouped_mean_std_female.loc[country, 'mean']
    std_female = grouped_mean_std_female.loc[country, 'std']

    t_stat, p_value = ttest_ind(df_male[df_male['Country'] == country]['Total Correctly Guessed'],
                                df_female[df_female['Country'] == country]['Total Correctly Guessed'],
                                equal_var=False)

    if p_value < 0.05:
        statistically_different_countries.append(country)
        count_male = len(df_male[df_male['Country'] == country])
        count_female = len(df_female[df_female['Country'] == country])
        counts_per_country[country] = {'Male': count_male, 'Female': count_female}


print("Statistically Different Countries:")
for country in statistically_different_countries:
    print(f"Country: {country}")
    print(f" - Male Respondents: {counts_per_country[country]['Male']}")
    print(f" - Female Respondents: {counts_per_country[country]['Female']}")

Statistically Different Countries:
Country: India
 - Male Respondents: 150
 - Female Respondents: 139
Country: Iran
 - Male Respondents: 83
 - Female Respondents: 123


smoke_table = df[['SMOKE', 'Country', 'totcorr']].rename(columns={'totcorr': 'Total Correctly Guessed'})
smoke_table.head()


yes_smokers_table = smoke_table[smoke_table['SMOKE'] == 'YES']

no_smokers_table = smoke_table[smoke_table['SMOKE'] == 'NO']

yes_smokers_table.head(), no_smokers_table.head()

(            SMOKE Country  Total Correctly Guessed
 Respondents                                       
 4             YES     USA                        3
 12            YES     USA                        2
 20            YES     USA                        2
 36            YES     USA                        2
 65            YES     USA                        2,
             SMOKE Country  Total Correctly Guessed
 Respondents                                       
 1              NO     USA                        4
 2              NO     USA                        4
 3              NO     USA                        4
 5              NO     USA                        0
 6              NO     USA                        3)


grouped_mean_std_yes = yes_smokers_table.groupby('Country')['Total Correctly Guessed'].agg(['mean', 'std']).reset_index()
grouped_mean_std_no = no_smokers_table.groupby('Country')['Total Correctly Guessed'].agg(['mean', 'std']).reset_index()
grouped_mean_std_yes.head(), grouped_mean_std_no.head()

(     Country      mean       std
 0  Argentina  2.640000  1.185093
 1  Australia  2.995445  1.247632
 2    Austria  3.230263  1.215084
 3    Bahamas  3.090909  1.254169
 4    Bahrain  3.000000  1.154701,
      Country      mean       std
 0  Argentina  2.589873  1.193850
 1  Australia  3.043067  1.195196
 2    Austria  3.124272  1.175336
 3    Bahamas  3.159722  1.150635
 4    Bahrain  3.116279  1.331117)


from scipy.stats import ttest_ind

significant_difference_countries = []

for country in yes_smokers_table['Country'].unique():
    total_correct_yes = yes_smokers_table[yes_smokers_table['Country'] == country]['Total Correctly Guessed']
    total_correct_no = no_smokers_table[no_smokers_table['Country'] == country]['Total Correctly Guessed']

    t_stat, p_value = ttest_ind(total_correct_yes, total_correct_no, equal_var=False)

    if p_value < 0.10 and total_correct_yes.mean() < total_correct_no.mean():
        significant_difference_countries.append(country)

significant_difference_countries

['Australia', 'Kenya']


fig, ax = plt.subplots(figsize=(10, 6))
df.groupby('SMOKE')['totcorr'].mean().plot(kind='bar', color=['skyblue', 'salmon'])

plt.title('Mean Total Correctly Guessed by Smoking Status')
plt.xlabel('SMOKE')
plt.ylabel('Mean Total Correctly Guessed')

plt.show()


selected_columns = [ 'SEX','SMOKE', 'Country','AND_QUAL', 'AA_QUAL', 'GAL_QUAL', 'EUG_QUAL', 'MERCAP_QUAL', 'ROSE_QUAL']
df_selected = df[selected_columns]

print(df_selected.head())

                SEX SMOKE Country  AND_QUAL  AA_QUAL  GAL_QUAL  EUG_QUAL  \
Respondents                                                                
1            Female    NO     USA         2        4         4         5   
2            Female    NO     USA         3        4         4         4   
3            Female    NO     USA         5        5         3         5   
4              Male   YES     USA         3        3         0         5   
5            Female    NO     USA         0        0         4         4   

             MERCAP_QUAL  ROSE_QUAL  
Respondents                          
1                      1          4  
2                      0          5  
3                      2          5  
4                      1          2  
5                      3          3


from scipy.stats import ttest_ind

significant_difference_countries = []

for country in yes_smokers_table['Country'].unique():
    total_correct_yes = yes_smokers_table[yes_smokers_table['Country'] == country]['Total Correctly Guessed']
    total_correct_no = no_smokers_table[no_smokers_table['Country'] == country]['Total Correctly Guessed']

    t_stat, p_value = ttest_ind(total_correct_yes, total_correct_no, equal_var=False)

    if p_value < 0.10 and total_correct_yes.mean() < total_correct_no.mean():
        significant_difference_countries.append(country)

counts_per_country = {}
for country in significant_difference_countries:
    count_yes = len(yes_smokers_table[yes_smokers_table['Country'] == country])
    count_no = len(no_smokers_table[no_smokers_table['Country'] == country])
    counts_per_country[country] = {'Smokers (Yes)': count_yes, 'Non-Smokers (No)': count_no}

print("Counts of Respondents for Significant Countries:")
for country, counts in counts_per_country.items():
    print(f"Country: {country}")
    print(f" - Smokers (Yes): {counts['Smokers (Yes)']}")
    print(f" - Non-Smokers (No): {counts['Non-Smokers (No)']}")

Counts of Respondents for Significant Countries:
Country: Australia
 - Smokers (Yes): 2854
 - Non-Smokers (No): 17438
Country: Kenya
 - Smokers (Yes): 20
 - Non-Smokers (No): 62


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

selected_columns = ['SEX', 'SMOKE', 'Country', 'AND_QUAL', 'AA_QUAL', 'GAL_QUAL', 'EUG_QUAL', 'MERCAP_QUAL', 'ROSE_QUAL']
df_selected = df[selected_columns].copy()

df_selected['SMOKE_BINARY'] = df_selected['SMOKE'].map({'NO': 0, 'YES': 1})
df_selected['SEX_BINARY'] = df_selected['SEX'].map({'MALE': 0, 'FEMALE': 1})
df_selected.drop(['SMOKE', 'SEX'], axis=1, inplace=True)

target_variables = ['AND_QUAL', 'AA_QUAL', 'GAL_QUAL', 'EUG_QUAL', 'MERCAP_QUAL', 'ROSE_QUAL']
models = {}
results = {}

for target_var in target_variables:
    X = df_selected[['SMOKE_BINARY', 'SEX_BINARY', 'Country']]
    y = df_selected[target_var]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    numeric_features = ['SMOKE_BINARY', 'SEX_BINARY']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    categorical_features = ['Country']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(sparse=False))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    logistic_model = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('classifier', LogisticRegression(random_state=42, max_iter=1000))])
    logistic_model.fit(X_train, y_train)
    y_pred = logistic_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, zero_division=0)
    models[target_var] = logistic_model
    results[target_var] = {
        'Accuracy': accuracy,
        'Classification Report': classification_rep
    }

for target_var, result in results.items():
    print(f"Results for {target_var}:")
    print(f"Accuracy: {result['Accuracy']}")
    print("Classification Report:")
    print(result['Classification Report'])

Results for AND_QUAL:
Accuracy: 0.27703869984835316
Classification Report:
              precision    recall  f1-score   support

           0       0.28      0.96      0.43     77979
           1       0.00      0.00      0.00     19863
           2       0.00      0.00      0.00     37735
           3       0.26      0.04      0.07     67414
           4       0.27      0.01      0.03     62872
           5       0.00      0.00      0.00     18350

    accuracy                           0.28    284213
   macro avg       0.14      0.17      0.09    284213
weighted avg       0.20      0.28      0.14    284213

Results for AA_QUAL:
Accuracy: 0.4663192746285356
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5478
           1       0.00      0.00      0.00      5657
           2       0.00      0.00      0.00     19853
           3       0.00      0.00      0.00     60402
           4       0.47      1.00      0.64    132537
           5       0.33      0.00      0.00     60286

    accuracy                           0.47    284213
   macro avg       0.13      0.17      0.11    284213
weighted avg       0.29      0.47      0.30    284213

Results for GAL_QUAL:
Accuracy: 0.2992157290482842
Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.82      0.44     83367
           1       0.00      0.00      0.00      9053
           2       0.00      0.00      0.00     20830
           3       0.26      0.00      0.00     67574
           4       0.28      0.23      0.25     71446
           5       0.00      0.00      0.00     31943

    accuracy                           0.30    284213
   macro avg       0.14      0.18      0.12    284213
weighted avg       0.22      0.30      0.19    284213

Results for EUG_QUAL:
Accuracy: 0.41255326111050517
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5566
           1       0.00      0.00      0.00      7004
           2       0.00      0.00      0.00     16078
           3       0.27      0.00      0.01     43263
           4       0.41      1.00      0.58    117259
           5       0.57      0.00      0.00     95043

    accuracy                           0.41    284213
   macro avg       0.21      0.17      0.10    284213
weighted avg       0.40      0.41      0.24    284213

Results for MERCAP_QUAL:
Accuracy: 0.5923198446235746
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     10344
           1       0.59      1.00      0.74    168347
           2       0.33      0.00      0.00     71492
           3       0.00      0.00      0.00     23981
           4       0.00      0.00      0.00      4320
           5       0.00      0.00      0.00      5729

    accuracy                           0.59    284213
   macro avg       0.15      0.17      0.12    284213
weighted avg       0.43      0.59      0.44    284213

Results for ROSE_QUAL:
Accuracy: 0.4965641965708817
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4383
           1       0.00      0.00      0.00      3361
           2       0.00      0.00      0.00      9248
           3       0.00      0.00      0.00     32495
           4       0.00      0.00      0.00     93596
           5       0.50      1.00      0.66    141130

    accuracy                           0.50    284213
   macro avg       0.08      0.17      0.11    284213
weighted avg       0.25      0.50      0.33    284213


import matplotlib.pyplot as plt
from sklearn.metrics import classification_report


target_variables = ['AND_QUAL', 'AA_QUAL', 'GAL_QUAL', 'EUG_QUAL', 'MERCAP_QUAL', 'ROSE_QUAL']

for target_var in target_variables:
    
    classification_report_str = results[target_var]['Classification Report']

    class_report_lines = classification_report_str.split('\n')

    precision = {}
    recall = {}
    f1_score = {}

    for line in class_report_lines[2:-5]:  
        parts = line.split()
        label = parts[0]
        precision[label] = float(parts[1])
        recall[label] = float(parts[2])
        f1_score[label] = float(parts[3])


    class_labels = list(precision.keys())

    plt.figure(figsize=(10, 6))
    plt.bar(class_labels, [precision[label] for label in class_labels], color='blue', alpha=0.7, label='Precision')
    plt.xlabel('Class Labels')
    plt.ylabel('Precision')
    plt.title(f'Precision by Class for {target_var}')
    plt.legend()
    plt.show()

    plt.figure(figsize=(10, 6))
    plt.bar(class_labels, [recall[label] for label in class_labels], color='green', alpha=0.7, label='Recall')
    plt.xlabel('Class Labels')
    plt.ylabel('Recall')
    plt.title(f'Recall by Class for {target_var}')
    plt.legend()
    plt.show()

    plt.figure(figsize=(10, 6))
    plt.bar(class_labels, [f1_score[label] for label in class_labels], color='orange', alpha=0.7, label='F1-Score')
    plt.xlabel('Class Labels')
    plt.ylabel('F1-Score')
    plt.title(f'F1-Score by Class for {target_var}')
    plt.legend()
    plt.show()


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

target_variables = ['AND_QUAL', 'AA_QUAL', 'GAL_QUAL', 'EUG_QUAL', 'MERCAP_QUAL', 'ROSE_QUAL']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.subplots_adjust(hspace=0.5)
fig.suptitle('Confusion Matrices for Target Variables', fontsize=16)

for i, target_var in enumerate(target_variables):
    y_pred = models[target_var].predict(X_test)
    conf_matrix = confusion_matrix(y_test, y_pred)
    confusion_df = pd.DataFrame(conf_matrix, columns=[f'Predicted {j}' for j in range(6)],
                                index=[f'Actual {j}' for j in range(6)])
    row = i // 3
    col = i % 3
    ax = axes[row, col]
    sns.heatmap(confusion_df, annot=True, fmt='d', cmap='Blues', linewidths=.5, cbar=False, ax=ax)
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.set_title(f'Confusion Matrix for {target_var}')

plt.show()


import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score

target_variables = ['AND_QUAL', 'AA_QUAL', 'GAL_QUAL', 'EUG_QUAL', 'MERCAP_QUAL', 'ROSE_QUAL']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.subplots_adjust(hspace=0.5)
fig.suptitle('Class-wise ROC Curves for Target Variables', fontsize=16)

for i, target_var in enumerate(target_variables):
    y_true = df_selected[target_var]
    y_score = models[target_var].predict_proba(X)
    
    lb = LabelBinarizer()
    y_true_bin = lb.fit_transform(y_true)
    
    fpr = {}
    tpr = {}
    roc_auc = {}
    
    for class_index in range(y_true_bin.shape[1]):
        fpr[class_index], tpr[class_index], _ = roc_curve(y_true_bin[:, class_index], y_score[:, class_index])
        roc_auc[class_index] = auc(fpr[class_index], tpr[class_index])
    
    row = i // 3
    col = i % 3
    ax = axes[row, col]
    
    for class_index in range(y_true_bin.shape[1]):
        ax.plot(fpr[class_index], tpr[class_index], lw=2, label=f'Class {class_index} (area = {roc_auc[class_index]:.2f})')
    
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Class-wise ROC Curves for {target_var}')
    ax.legend(loc='lower right')

plt.show()


import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import LabelBinarizer

target_variables = ['AND_QUAL', 'AA_QUAL', 'GAL_QUAL', 'EUG_QUAL', 'MERCAP_QUAL', 'ROSE_QUAL']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.subplots_adjust(hspace=0.5)
fig.suptitle('Precision-Recall Curves for Target Variables', fontsize=16)

for i, target_var in enumerate(target_variables):
    y_true = df_selected[target_var]
    y_score = models[target_var].predict_proba(X)
    
    lb = LabelBinarizer()
    y_true_bin = lb.fit_transform(y_true)
    
    precision = {}
    recall = {}
    thresholds = {}
    
    for class_index in range(y_true_bin.shape[1]):
        precision[class_index], recall[class_index], thresholds[class_index] = precision_recall_curve(y_true_bin[:, class_index], y_score[:, class_index])
    
    row = i // 3
    col = i % 3
    ax = axes[row, col]
    
    for class_index in range(y_true_bin.shape[1]):
        ax.plot(recall[class_index], precision[class_index], lw=2, label=f'Class {class_index}')
    
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(f'Precision-Recall Curve for {target_var}')
    ax.legend(loc='lower left')

plt.show()


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
feature_importance = {}

for target_var in target_variables:
    logistic_model = models[target_var]
    coefficients = logistic_model.named_steps['classifier'].coef_
    
    
    categorical_encoder = logistic_model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
    feature_names = numeric_features + list(categorical_encoder.get_feature_names(input_features=categorical_features))

    if len(coefficients[0]) != len(feature_names):
        feature_names = feature_names[:len(coefficients[0])]
    
    class_importance = np.abs(coefficients)
    
    class_importance_df = pd.DataFrame(class_importance, columns=feature_names)
    class_importance_df = class_importance_df.T
    
    feature_importance[target_var] = class_importance_df

for target_var, importance_df in feature_importance.items():
    plt.figure(figsize=(12, 6))
    plt.title(f'Feature Importance for {target_var}')
    sns.barplot(data=importance_df, orient='h', palette='viridis')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.show()


from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

predicted_probabilities = logistic_model.predict_proba(X_test)  
true_labels = y_test  
plt.figure(figsize=(8, 6))

for class_index in range(predicted_probabilities.shape[1]):
    prob_true, prob_pred = calibration_curve(true_labels == class_index, predicted_probabilities[:, class_index], n_bins=10)
    plt.plot(prob_pred, prob_true, marker='o', linestyle='--', label=f'Class {class_index}')

plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly Calibrated')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.title('Calibration Curves for Each Class')
plt.legend()
plt.grid(False)
plt.show()


import numpy as np
import matplotlib.pyplot as plt


predicted_probabilities = logistic_model.predict_proba(X_test)  
true_labels = y_test  
predicted_classes = np.argmax(predicted_probabilities, axis=1)
predicted_probability_for_true_class = predicted_probabilities[np.arange(len(true_labels)), true_labels]

plt.figure(figsize=(10, 6))
plt.hist(predicted_probability_for_true_class, bins=20, alpha=0.5, label='Correct Class', color='blue')
plt.hist(np.max(predicted_probabilities, axis=1), bins=20, alpha=0.5, label='All Classes', color='red')
plt.xlabel('Predicted Probability')
plt.ylabel('Frequency')
plt.title('Error Analysis Plot')
plt.legend()
plt.grid(False)
plt.show()

	Country	Year	Data.Daily cigarettes	Data.Percentage.Male	Data.Percentage.Female	Data.Percentage.Total	Data.Smokers.Total	Data.Smokers.Female	Data.Smokers.Male
0	Afghanistan	1980	5.7	10.4	18.4	2.4	733520	81707	651813
1	Afghanistan	1981	5.8	10.5	18.4	2.3	720102	79276	640826
2	Afghanistan	1982	5.8	10.5	18.5	2.3	700415	76061	624355
3	Afghanistan	1983	5.9	10.5	18.6	2.3	676984	72411	604572
4	Afghanistan	1984	6.0	10.6	18.6	2.3	653812	68908	584905

	Country	Year	Data.Daily cigarettes	Data.Percentage.Male	Data.Percentage.Female	Data.Percentage.Total	Data.Smokers.Total	Data.Smokers.Female	Data.Smokers.Male
7	Afghanistan	1987	6.1	10.700000	18.799999	2.2	608651	61716	546936
40	Albania	1987	17.0	21.299999	38.299999	3.3	448005	33774	414231
73	Algeria	1987	18.1	16.000000	31.200001	0.8	2125295	53944	2071351
106	Andorra	1987	19.5	31.200001	37.799999	25.1	11436	4791	6644
139	Angola	1987	23.5	8.900000	17.100000	1.1	448940	28647	420293

	Country	Year	Data.Daily cigarettes	Data.Percentage.Male	Data.Percentage.Female	Data.Percentage.Total	Data.Smokers.Total	Data.Smokers.Female	Data.Smokers.Male
7	Afghanistan	1987	6.1	10.700000	18.799999	2.2	608651	61716	546936
40	Albania	1987	17.0	21.299999	38.299999	3.3	448005	33774	414231
73	Algeria	1987	18.1	16.000000	31.200001	0.8	2125295	53944	2071351
106	Andorra	1987	19.5	31.200001	37.799999	25.1	11436	4791	6644
139	Angola	1987	23.5	8.900000	17.100000	1.1	448940	28647	420293

	Country	Year	Data.Daily cigarettes	Data.Percentage.Male	Data.Percentage.Female	Data.Percentage.Total	Data.Smokers.Total	Data.Smokers.Female	Data.Smokers.Male
7	Afghanistan	1987	6.1	10.700000	18.799999	2.2	608651	61716	546936
40	Albania	1987	17.0	21.299999	38.299999	3.3	448005	33774	414231
73	Algeria	1987	18.1	16.000000	31.200001	0.8	2125295	53944	2071351
106	Andorra	1987	19.5	31.200001	37.799999	25.1	11436	4791	6644
139	Angola	1987	23.5	8.900000	17.100000	1.1	448940	28647	420293

	Country	Year	Data.Daily cigarettes	Data.Percentage.Male	Data.Percentage.Female	Data.Percentage.Total	Data.Smokers.Total	Data.Smokers.Female	Data.Smokers.Male
0	Afghanistan	1987.0	6.1	10.700000	18.799999	2.2	608651.0	61716.0	546936.0
1	Albania	1987.0	17.0	21.299999	38.299999	3.3	448005.0	33774.0	414231.0
2	Algeria	1987.0	18.1	16.000000	31.200001	0.8	2125295.0	53944.0	2071351.0
3	Andorra	1987.0	19.5	31.200001	37.799999	25.1	11436.0	4791.0	6644.0
4	Angola	1987.0	23.5	8.900000	17.100000	1.1	448940.0	28647.0	420293.0

Jakob Schanzer¶

Data Science Final Project¶

National Geographic Smell Perception Survey¶

To access my webpage please CLICK HERE ¶

	Unnamed: 0	HAND_WRITE	HAND_THROW	WORK_ENVIOR	SELF_RATE_SMELL	DAYS_W_PERF	PERF_GT_1	FLU_COLD	...	decage	corr1	corr2	corr3	corr4	corr5	corr6	totcorr	source	a1a6_0
0	1	2	2	1	4	1	2	0	...	3	2	1	1	1	2	1	4	4	0
1	2	2	2	1	5	2	1	1	...	4	2	1	1	1	0	1	4	4	1
2	3	2	2	6	5	4	2	1	...	8	2	1	2	1	1	1	4	4	0
3	4	2	2	3	3	1	0	1	...	2	2	2	0	1	1	1	3	4	1
4	5	1	1	1	4	1	2	1	...	6	0	2	2	2	2	2	0	4	1

	Unnamed: 0	HAND_WRITE	HAND_THROW	WORK_ENVIOR	SELF_RATE_SMELL	DAYS_W_PERF	PERF_GT_1	FLU_COLD	...	decage	corr1	corr2	corr3	corr4	corr5	corr6	totcorr	source	a1a6_0
0	1	Right	2	1	4	1	2	0	...	3	2	1	1	1	2	1	4	4	0
1	2	Right	2	1	5	2	1	1	...	4	2	1	1	1	0	1	4	4	1
2	3	Right	2	6	5	4	2	1	...	8	2	1	2	1	1	1	4	4	0
3	4	Right	2	3	3	1	0	1	...	2	2	2	0	1	1	1	3	4	1
4	5	Left	1	1	4	1	2	1	...	6	0	2	2	2	2	2	0	4	1

	Unnamed: 0	HAND_WRITE	HAND_THROW	WORK_ENVIOR	SELF_RATE_SMELL	DAYS_W_PERF	PERF_GT_1	FLU_COLD	...	decage	corr1	corr2	corr3	corr4	corr5	corr6	totcorr	source	a1a6_0
0	1	Right	Right	Home	4	1	2	0	...	3	2	1	1	1	2	1	4	4	0
1	2	Right	Right	Home	5	2	1	1	...	4	2	1	1	1	0	1	4	4	1
2	3	Right	Right	No Job	5	4	2	1	...	8	2	1	2	1	1	1	4	4	0
3	4	Right	Right	Office	3	1	0	1	...	2	2	2	0	1	1	1	3	4	1
4	5	Left	Left	Home	4	1	2	1	...	6	0	2	2	2	2	2	0	4	1

	HAND_WRITE	HAND_THROW	WORK_ENVIOR	SELF_RATE_SMELL	DAYS_W_PERF	PERF_GT_1	CHEM_EXPOSE	PREG	FLU_COLD	HEAD_INJ	...	decage	corr1	corr2	corr3	corr4	corr5	corr6	totcorr	source	a1a6_0
Respondents
1	Right	Right	Home	4	1	2	0	0	0	0	...	3	2	1	1	1	2	1	4	4	0
2	Right	Right	Home	5	2	1	0	0	1	0	...	4	2	1	1	1	0	1	4	4	1
3	Right	Right	No Job	5	4	2	0	0	1	0	...	8	2	1	2	1	1	1	4	4	0
4	Right	Right	Office	3	1	0	0	0	1	0	...	2	2	2	0	1	1	1	3	4	1
5	Left	Left	Home	4	1	2	0	0	1	0	...	6	0	2	2	2	2	2	0	4	1

	AND_MEM	AA_MEM	GAL_MEM	EUG_MEM	MER_MEM	ROSE_MEM
Respondents
1	No	No	No	Yes	Yes	Yes
2	No	No	No	Yes	No response	Yes
3	No	No	No	No	No response	No
4	No	No	No response	Yes	Yes	Yes
5	No response	No	No	No	No	No

	AND_DES	AA_DES	GAL_DES	EUG_DES	MER_DES	ROSE_DES
Respondents
1	Spicy	Fruity	Musky	Spicy	Burnt	Floral
2	Woody	Fruity	Musky	Spicy	No response	Floral
3	Floral	Fruity	Sweet	Spicy	Foul	Floral
4	Floral	Other	No response	Spicy	Foul	Floral
5	No odor	Other	Other	No response	Other	Other

	corr1	corr2	corr3	corr4	corr5	corr6
Respondents
1	Incorrect	Correct	Correct	Correct	Incorrect	Correct
2	Incorrect	Correct	Correct	Correct	No response	Correct
3	Incorrect	Correct	Incorrect	Correct	Correct	Correct
4	Incorrect	Incorrect	No response	Correct	Correct	Correct
5	No response	Incorrect	Incorrect	Incorrect	Incorrect	Incorrect

	Country	totcorr	Data.Daily cigarettes	Data.Percentage.Female	Data.Percentage.Male	Data.Percentage.Total	Data.Smokers.Female	Data.Smokers.Male	Data.Smokers.Total	Year
0	Canada	4	22.799999	33.599998	32.0	30.5	3236307.0	3453267.0	6689574.0	1987.0
1	Canada	5	22.799999	33.599998	32.0	30.5	3236307.0	3453267.0	6689574.0	1987.0
2	Canada	4	22.799999	33.599998	32.0	30.5	3236307.0	3453267.0	6689574.0	1987.0
3	Canada	4	22.799999	33.599998	32.0	30.5	3236307.0	3453267.0	6689574.0	1987.0
4	Canada	3	22.799999	33.599998	32.0	30.5	3236307.0	3453267.0	6689574.0	1987.0

	Country	Data.Daily cigarettes	Data.Percentage.Total	Data.Smokers.Total	Average Correct by Country	std
0	Canada	22.799999	30.500000	6689574.0	3.280026	1.141738
1	Italy	19.299999	22.799999	13850692.0	2.958034	1.114232
2	New Zealand	19.600000	27.799999	706029.0	3.025372	1.169047
3	Australia	23.900000	24.299999	3321084.0	3.036484	1.202611
4	Netherlands	11.000000	24.000000	3285320.0	3.085908	1.216398

	mean_Male	std_Male	mean_Female	std_Female
Country
Argentina	2.632075	1.125537	2.584314	1.257947
Australia	3.037796	1.201065	3.034619	1.205273
Austria	3.125874	1.186870	3.162234	1.183333
Bahamas	3.297297	1.131489	3.052174	1.190927
Bahrain	2.960000	1.098484	3.063830	1.435637

	Country	Data.Daily cigarettes	Data.Percentage.Female	Data.Percentage.Male	Data.Percentage.Total	Data.Smokers.Female	Data.Smokers.Male	Data.Smokers.Total	Year	Average Correct by Country	std
0	Canada	22.799999	33.599998	32.000000	30.500000	3236307.0	3453267.0	6689574.0	1987.0	3.280026	1.141738
1	Italy	19.299999	37.400002	29.799999	22.799999	5512402.0	8338290.0	13850692.0	1987.0	2.958034	1.114232
2	New Zealand	19.600000	28.400000	28.100000	27.799999	356323.0	349706.0	706029.0	1987.0	3.025372	1.169047
3	Australia	23.900000	28.600000	26.400000	24.299999	1539836.0	1781248.0	3321084.0	1987.0	3.036484	1.202611
4	Netherlands	11.000000	31.200001	27.500000	24.000000	1458590.0	1826729.0	3285320.0	1987.0	3.085908	1.216398
5	Malta	37.099998	35.500000	28.600000	22.200001	30989.0	47051.0	78040.0	1987.0	2.973404	1.272639
6	Israel	21.600000	36.799999	31.000000	25.400000	366359.0	518478.0	884837.0	1987.0	2.617008	1.212430
7	Greece	26.400000	56.299999	39.799999	24.200001	989792.0	2175146.0	3164939.0	1987.0	2.987667	1.188877
8	Spain	21.600000	44.000000	34.299999	25.100000	3917318.0	6423273.0	10340591.0	1987.0	2.744792	1.098522
9	Brazil	35.200001	22.500000	18.200001	14.000000	6414858.0	9983413.0	16398271.0	1987.0	2.823615	1.237880
10	Venezuela	21.600000	27.500000	20.200001	12.900000	719071.0	1545890.0	2264961.0	1987.0	3.042216	1.222934
11	Poland	24.600000	50.799999	37.000000	24.200001	3523411.0	6829528.0	10352938.0	1987.0	2.907258	1.314613
12	Mexico	10.700000	38.000000	25.400000	13.300000	3266212.0	8947400.0	12213611.0	1987.0	2.908350	1.170077
13	Portugal	24.700001	36.200001	21.100000	7.600000	307957.0	1322551.0	1630508.0	1987.0	2.651969	1.167437
14	Costa Rica	29.799999	15.000000	10.300000	5.500000	49798.0	139362.0	189161.0	1987.0	2.844884	1.247129
15	France	17.200001	42.799999	33.500000	24.799999	5696888.0	9115945.0	14812833.0	1987.0	2.995956	1.184824
16	Austria	26.500000	35.099998	26.500000	18.900000	624944.0	1023319.0	1648263.0	1987.0	3.144776	1.188483
17	Indonesia	9.400000	54.000000	28.600000	3.600000	1923421.0	28013907.0	29937326.0	1987.0	2.702703	1.324207
18	Belgium	22.600000	37.599998	30.900000	24.600000	1027807.0	1467785.0	2495593.0	1987.0	2.882123	1.254200
19	Norway	5.200000	38.099998	34.500000	31.000000	533026.0	630663.0	1163689.0	1987.0	3.095383	1.201536
20	Iceland	22.799999	35.500000	34.200001	32.900002	30153.0	32408.0	62561.0	1987.0	3.033557	1.170678
21	Denmark	12.700000	36.200001	37.000000	37.799999	813841.0	746764.0	1560605.0	1987.0	3.179708	1.342231
22	Finland	15.800000	32.000000	24.299999	17.100000	355710.0	609506.0	965217.0	1987.0	3.071062	1.184411
23	Sweden	14.600000	25.799999	25.600000	25.500000	897907.0	869507.0	1767414.0	1987.0	3.202904	1.184194
24	Argentina	19.100000	29.799999	25.400000	21.299999	2366461.0	3116779.0	5483239.0	1987.0	2.594862	1.195658
25	Ecuador	15.800000	14.400000	8.700000	3.000000	84842.0	410604.0	495446.0	1987.0	2.990291	1.098083
26	United Kingdom	17.900000	32.299999	31.299999	30.400000	7274518.0	7140056.0	14414574.0	1987.0	3.054914	1.206809
27	Japan	25.200001	56.700001	33.500000	11.500000	5714533.0	26585993.0	32300525.0	1987.0	2.743017	1.228008
28	Thailand	15.000000	46.299999	24.900000	4.300000	791688.0	8245491.0	9037179.0	1987.0	2.917355	1.166096
29	Malaysia	20.700001	44.299999	23.100000	1.800000	92751.0	2301490.0	2394241.0	1987.0	2.508314	1.190459
30	Pakistan	17.900000	32.500000	19.600000	5.400000	1465186.0	9773248.0	11238433.0	1987.0	2.661972	1.133183
31	Saudi Arabia	79.099998	10.600000	6.500000	0.600000	20939.0	523050.0	543988.0	1987.0	3.108333	1.327130
32	Taiwan	29.400000	39.599998	22.400000	3.600000	244793.0	2888089.0	3132882.0	1987.0	2.863014	1.272738
33	Singapore	24.799999	28.200001	16.400000	4.300000	43001.0	286148.0	329149.0	1987.0	2.599156	1.219850
34	China	17.400000	52.599998	29.500000	4.900000	18285304.0	209437144.0	227722450.0	1987.0	1.931034	1.307425
35	Philippines	17.799999	47.500000	28.100000	8.700000	1450047.0	7986001.0	9436048.0	1987.0	2.596899	1.308573
36	Peru	9.500000	14.100000	10.000000	5.800000	362184.0	872296.0	1234480.0	1987.0	2.938272	1.279030
37	Colombia	24.799999	21.000000	14.400000	8.100000	815929.0	2016256.0	2832185.0	1987.0	2.795580	1.223466
38	United Arab Emirates	13.700000	25.900000	18.700001	0.700000	1971.0	189946.0	191917.0	1987.0	3.264957	1.241476
39	Ireland	19.100000	35.000000	33.200001	31.400000	403179.0	438464.0	841643.0	1987.0	2.938298	1.187716

	Unnamed: 0	HAND_WRITE	HAND_THROW	WORK_ENVIOR	SELF_RATE_SMELL	DAYS_W_PERF	PERF_GT_1	FLU_COLD	...	decage	corr1	corr2	corr3	corr4	corr5	corr6	totcorr	source	a1a6_0
0	1	2	2	1	4	1	2	0	...	3	2	1	1	1	2	1	4	4	0
1	2	2	2	1	5	2	1	1	...	4	2	1	1	1	0	1	4	4	1
2	3	2	2	6	5	4	2	1	...	8	2	1	2	1	1	1	4	4	0
3	4	2	2	3	3	1	0	1	...	2	2	2	0	1	1	1	3	4	1
4	5	1	1	1	4	1	2	1	...	6	0	2	2	2	2	2	0	4	1

Jakob Schanzer¶

Data Science Final Project¶

National Geographic Smell Perception Survey¶

To access my webpage please CLICK HERE¶

To access my webpage please CLICK HERE ¶

	Unnamed: 0	HAND_WRITE	HAND_THROW	WORK_ENVIOR	SELF_RATE_SMELL	DAYS_W_PERF	PERF_GT_1	FLU_COLD	...	decage	corr1	corr2	corr3	corr4	corr5	corr6	totcorr	source	a1a6_0
0	1	2	2	1	4	1	2	0	...	3	2	1	1	1	2	1	4	4	0
1	2	2	2	1	5	2	1	1	...	4	2	1	1	1	0	1	4	4	1
2	3	2	2	6	5	4	2	1	...	8	2	1	2	1	1	1	4	4	0
3	4	2	2	3	3	1	0	1	...	2	2	2	0	1	1	1	3	4	1
4	5	1	1	1	4	1	2	1	...	6	0	2	2	2	2	2	0	4	1