import os
# For analysis
import pandas as pd
import numpy as np
import csv
# For visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
dset_2018 = pd.read_csv(os.path.join(os.getcwd(), 'archive', 'IT_Salary_Survey_EU_2018.csv'))
dset_2019 = pd.read_csv(os.path.join(os.getcwd(), 'archive', 'IT_Salary_Survey_EU_2019.csv'))
dset_2020 = pd.read_csv(os.path.join(os.getcwd(), 'archive', 'IT_Salary_Survey_EU_2020.csv'))
data_dict = {'2018' : dset_2018, '2019' : dset_2019, '2020' : dset_2020 }
print("Shape of dataset for each year")
for key, value in data_dict.items():
    print('Year : ', key, " - \t Shape of dataset: ", value.shape)
Shape of dataset for each year
Year :  2018  -  Shape of dataset:  (765, 14)
Year :  2019  -  Shape of dataset:  (991, 23)
Year :  2020  -  Shape of dataset:  (1253, 23)
dset_2018.head()
Timestamp | Age | Gender | City | Position | Years of experience | Your level | Current Salary | Salary one year ago | Salary two years ago | Are you getting any Stock Options? | Main language at work | Company size | Company type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14/12/2018 12:41:33 | 43.0 | M | München | QA Ingenieur | 11.0 | Senior | 77000.0 | 76200.0 | 68000.0 | No | Deutsch | 100-1000 | Product |
1 | 14/12/2018 12:42:09 | 33.0 | F | München | Senior PHP Magento developer | 8.0 | Senior | 65000.0 | 55000.0 | 55000.0 | No | Deutsch | 50-100 | Product |
2 | 14/12/2018 12:47:36 | 32.0 | M | München | Software Engineer | 10.0 | Senior | 88000.0 | 73000.0 | 54000.0 | No | Deutsch | 1000+ | Product |
3 | 14/12/2018 12:50:15 | 25.0 | M | München | Senior Frontend Developer | 6.0 | Senior | 78000.0 | 55000.0 | 45000.0 | Yes | English | 1000+ | Product |
4 | 14/12/2018 12:50:31 | 39.0 | M | München | UX Designer | 10.0 | Senior | 69000.0 | 60000.0 | 52000.0 | No | English | 100-1000 | Ecom retailer |
dset_2019.head()
Zeitstempel | Age | Gender | City | Seniority level | Position (without seniority) | Years of experience | Your main technology / programming language | Yearly brutto salary (without bonus and stocks) | Yearly bonus | ... | Yearly stocks one year ago. Only answer if staying in same country | Number of vacation days | Number of home office days per month | Main language at work | Company name | Company size | Company type | Сontract duration | Company business sector | 0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 02.12.2019 11:18:26 | 33.0 | Male | Berlin | Senior | Fullstack Developer | 13 | PHP | 64000.0 | 1000.0 | ... | NaN | 29.0 | 4.0 | English | NaN | 50-100 | Startup | unlimited | Tourism | NaN |
1 | 02.12.2019 11:18:35 | 29.0 | Male | Berlin | Middle | Backend Developer | 3 | Python | 55000.0 | NaN | ... | NaN | 22.0 | 4.0 | English | NaN | 10-50 | Product | unlimited | Scientific Activities | NaN |
2 | 02.12.2019 11:18:56 | NaN | Male | Berlin | Middle | Mobile Developer | 4 | Kotlin | 70000.0 | NaN | ... | NaN | 27.0 | NaN | English | NaN | 1000+ | Startup | unlimited | NaN | NaN |
3 | 02.12.2019 11:19:08 | 30.0 | Male | Berlin | Senior | Backend Developer | 6 | PHP | 63000.0 | NaN | ... | NaN | 24.0 | NaN | English | Auto1 | 100-1000 | Product | unlimited | Transport | NaN |
4 | 02.12.2019 11:19:37 | 32.0 | Male | Berlin | Senior | Embedded Developer | 10 | C/C++ | 66000.0 | NaN | ... | NaN | 30.0 | 0.0 | English | Luxoft | 50-100 | Product | unlimited | Automotive | NaN |
5 rows × 23 columns
dset_2020.head()
Timestamp | Age | Gender | City | Position | Total years of experience | Years of experience in Germany | Seniority level | Your main technology / programming language | Other technologies/programming languages you use often | ... | Annual bonus+stocks one year ago. Only answer if staying in same country | Number of vacation days | Employment status | Сontract duration | Main language at work | Company size | Company type | Have you lost your job due to the coronavirus outbreak? | Have you been forced to have a shorter working week (Kurzarbeit)? If yes, how many hours per week | Have you received additional monetary support from your employer due to Work From Home? If yes, how much in 2020 in EUR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 24/11/2020 11:14:15 | 26.0 | Male | Munich | Software Engineer | 5 | 3 | Senior | TypeScript | Kotlin, Javascript / Typescript | ... | 10000 | 30 | Full-time employee | Unlimited contract | English | 51-100 | Product | No | NaN | NaN |
1 | 24/11/2020 11:14:16 | 26.0 | Male | Berlin | Backend Developer | 7 | 4 | Senior | Ruby | NaN | ... | 5000 | 28 | Full-time employee | Unlimited contract | English | 101-1000 | Product | No | NaN | NaN |
2 | 24/11/2020 11:14:21 | 29.0 | Male | Berlin | Software Engineer | 12 | 6 | Lead | Javascript / Typescript | Javascript / Typescript, Docker | ... | 100000 | 30 | Self-employed (freelancer) | Temporary contract | English | 101-1000 | Product | Yes | NaN | NaN |
3 | 24/11/2020 11:15:24 | 28.0 | Male | Berlin | Frontend Developer | 4 | 1 | Junior | Javascript | NaN | ... | NaN | 24 | Full-time employee | Unlimited contract | English | 51-100 | Startup | No | NaN | NaN |
4 | 24/11/2020 11:15:46 | 37.0 | Male | Berlin | Backend Developer | 17 | 6 | Senior | C# .NET | .NET, SQL, AWS, Docker | ... | NaN | 29 | Full-time employee | Unlimited contract | English | 101-1000 | Product | No | NaN | NaN |
5 rows × 23 columns
The number of data points (rows) increases year over year. There are also significantly more columns (23 vs 14) in 2019 and 2020 compared with 2018. To create a single combined dataset, we will map each year's columns onto a common set of columns.
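Before settling on that common set, it helps to see how much the schemas overlap. A minimal sketch using column-set intersection (toy frames `a` and `b` stand in for two of the survey years loaded above):

```python
import pandas as pd

# Toy frames standing in for two of the survey datasets
a = pd.DataFrame(columns=['Timestamp', 'Age', 'Gender', 'Your level'])
b = pd.DataFrame(columns=['Zeitstempel', 'Age', 'Gender', 'Seniority level'])

common = set(a.columns) & set(b.columns)   # usable as-is across years
only_a = set(a.columns) - set(b.columns)   # need renaming or get dropped
print(sorted(common))   # ['Age', 'Gender']
```

Columns outside the intersection are exactly the ones that need the explicit rename mappings used below.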
print("\n")
print("Available Columns for 2018 : ",dset_2018.columns.tolist(),"\n")
print("Available Columns for 2019 : ",dset_2019.columns.tolist(),"\n")
print("Available Columns for 2020 : ",dset_2020.columns.tolist(),"\n")
Available Columns for 2018 :  ['Timestamp', 'Age', 'Gender', 'City', 'Position', 'Years of experience', 'Your level', 'Current Salary', 'Salary one year ago', 'Salary two years ago', 'Are you getting any Stock Options?', 'Main language at work', 'Company size', 'Company type']

Available Columns for 2019 :  ['Zeitstempel', 'Age', 'Gender', 'City', 'Seniority level', 'Position (without seniority)', 'Years of experience', 'Your main technology / programming language', 'Yearly brutto salary (without bonus and stocks)', 'Yearly bonus', 'Yearly stocks', 'Yearly brutto salary (without bonus and stocks) one year ago. Only answer if staying in same country', 'Yearly bonus one year ago. Only answer if staying in same country', 'Yearly stocks one year ago. Only answer if staying in same country', 'Number of vacation days', 'Number of home office days per month', 'Main language at work', 'Company name ', 'Company size', 'Company type', 'Сontract duration', 'Company business sector', '0']

Available Columns for 2020 :  ['Timestamp', 'Age', 'Gender', 'City', 'Position ', 'Total years of experience', 'Years of experience in Germany', 'Seniority level', 'Your main technology / programming language', 'Other technologies/programming languages you use often', 'Yearly brutto salary (without bonus and stocks) in EUR', 'Yearly bonus + stocks in EUR', 'Annual brutto salary (without bonus and stocks) one year ago. Only answer if staying in the same country', 'Annual bonus+stocks one year ago. Only answer if staying in same country', 'Number of vacation days', 'Employment status', 'Сontract duration', 'Main language at work', 'Company size', 'Company type', 'Have you lost your job due to the coronavirus outbreak?', 'Have you been forced to have a shorter working week (Kurzarbeit)? If yes, how many hours per week', 'Have you received additional monetary support from your employer due to Work From Home? If yes, how much in 2020 in EUR']
While merging the datasets we have to map the columns of the three surveys onto one another carefully: several fields are renamed between years (e.g. 'Your level' in 2018 vs 'Seniority level' in 2019/2020), some columns exist only in the later surveys, and the 2019 survey reports bonus and stocks separately, so they must be summed into a single figure.

After comparing the columns of the three datasets, the following columns were finalized: ['Timestamp', 'Age', 'Gender', 'City', 'Position', 'Experience_yrs', 'Experience_yrs_Germany', 'Seniority_level', 'Current_salary', 'Salary_one_yr_ago', 'Yearly_stocks_n_bonus_current', 'Yearly_stocks_n_bonus_one_yr_ago', 'Main_language_at_work', 'Company_size', 'Vacation_days', 'Сontract_duration', 'Company_type', 'Main_tech', 'Other_tech', 'Year']
# Outline new dataset
merged_dset = pd.DataFrame(columns = ['Timestamp', 'Age', 'Gender', 'City', 'Position', 'Experience_yrs',
'Experience_yrs_Germany', 'Seniority_level', 'Current_salary',
'Salary_one_yr_ago', 'Yearly_stocks_n_bonus_current','Yearly_stocks_n_bonus_one_yr_ago',
'Main_language_at_work', 'Company_size', 'Vacation_days','Сontract_duration',
'Company_type', 'Main_tech', 'Other_tech', 'Year'] )
print('[INFO] Merge Beginning')
# Merge 2018 dataset
dset_2018['Year'] = 2018
column_mapping_for_dset_2018 = {'Years of experience':'Experience_yrs', 'Your level':'Seniority_level',
'Current Salary' : 'Current_salary', 'Salary one year ago': 'Salary_one_yr_ago',
'Main language at work' : 'Main_language_at_work', 'Company size' : 'Company_size',
'Company type': 'Company_type', 'Timestamp':'Timestamp', 'Age':'Age',
'Gender':'Gender','City':'City','Position':'Position', 'Year':'Year'}
merged_dset = pd.concat([merged_dset, dset_2018[list(column_mapping_for_dset_2018.keys())].rename(columns = column_mapping_for_dset_2018)])
print('[INFO] Merged 2018 Dataset')
print('Shape of new dataset : ',merged_dset.shape)
# Merge 2019 dataset
dset_2019['Year'] = 2019
dset_2019['Yearly_stocks_n_bonus_current'] = dset_2019['Yearly bonus'].fillna(0) + dset_2019['Yearly stocks'].fillna(0)
dset_2019['Yearly_stocks_n_bonus_one_yr_ago'] = dset_2019['Yearly bonus one year ago. Only answer if staying in same country'].fillna(0) + dset_2019['Yearly stocks one year ago. Only answer if staying in same country'].fillna(0)
column_mapping_for_dset_2019 = {'Years of experience':'Experience_yrs', 'Seniority level':'Seniority_level',
'Yearly brutto salary (without bonus and stocks)' : 'Current_salary',
'Yearly brutto salary (without bonus and stocks) one year ago. Only answer if staying in same country': 'Salary_one_yr_ago',
'Yearly_stocks_n_bonus_current' : 'Yearly_stocks_n_bonus_current',
'Yearly_stocks_n_bonus_one_yr_ago' : 'Yearly_stocks_n_bonus_one_yr_ago',
'Main language at work' : 'Main_language_at_work', 'Company size' : 'Company_size',
'Company type': 'Company_type', 'Zeitstempel': 'Timestamp', 'Сontract duration': 'Сontract_duration',
'Position (without seniority)':'Position', 'Age':'Age', 'Number of vacation days' : 'Vacation_days',
'Gender':'Gender','City':'City', 'Year':'Year' }
merged_dset = pd.concat([merged_dset, dset_2019[list(column_mapping_for_dset_2019.keys())].rename(columns = column_mapping_for_dset_2019)])
print('[INFO] Merged 2019 Dataset')
print('Shape of new dataset : ',merged_dset.shape)
# Merge 2020 dataset
dset_2020['Year'] = 2020
column_mapping_for_dset_2020 = {'Total years of experience':'Experience_yrs', 'Seniority level':'Seniority_level',
'Yearly brutto salary (without bonus and stocks) in EUR' : 'Current_salary',
'Annual brutto salary (without bonus and stocks) one year ago. Only answer if staying in the same country': 'Salary_one_yr_ago',
'Yearly bonus + stocks in EUR': 'Yearly_stocks_n_bonus_current',
'Annual bonus+stocks one year ago. Only answer if staying in same country': 'Yearly_stocks_n_bonus_one_yr_ago',
'Main language at work' : 'Main_language_at_work', 'Company size' : 'Company_size',
'Company type': 'Company_type', 'Timestamp': 'Timestamp',
'Position ':'Position', 'Age':'Age', 'Number of vacation days' : 'Vacation_days',
'Gender':'Gender','City':'City', 'Year':'Year', 'Сontract duration': 'Сontract_duration',
'Years of experience in Germany':'Experience_yrs_Germany',
'Your main technology / programming language': 'Main_tech',
'Other technologies/programming languages you use often': 'Other_tech' }
merged_dset = pd.concat([merged_dset, dset_2020[list(column_mapping_for_dset_2020.keys())].rename(columns = column_mapping_for_dset_2020)])
print('[INFO] Merged 2020 Dataset')
print('Shape of new dataset : ',merged_dset.shape)
[INFO] Merge Beginning
[INFO] Merged 2018 Dataset
Shape of new dataset :  (765, 20)
[INFO] Merged 2019 Dataset
Shape of new dataset :  (1756, 20)
[INFO] Merged 2020 Dataset
Shape of new dataset :  (3009, 20)
merged_dset.sample(5)
Timestamp | Age | Gender | City | Position | Experience_yrs | Experience_yrs_Germany | Seniority_level | Current_salary | Salary_one_yr_ago | Yearly_stocks_n_bonus_current | Yearly_stocks_n_bonus_one_yr_ago | Main_language_at_work | Company_size | Vacation_days | Сontract_duration | Company_type | Main_tech | Other_tech | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
30 | 02.12.2019 11:32:59 | 32.0 | Male | Berlin | Software Architect | 13.0 | NaN | Senior | 78600.0 | 78600.0 | 0.0 | 0.0 | English | 100-1000 | 30.0 | unlimited | Product | NaN | NaN | 2019 |
847 | 27/11/2020 14:14:20 | 28.0 | Male | Warsaw, Poland | Software Engineer | 6 | 3 (in Poland) | Middle | 52800.0 | 47500.0 | 4400 | 3900 | English | 51-100 | 26 | Unlimited contract | Consulting / Agency | C# | Javascript / Typescript, .NET, SQL | 2020 |
884 | 19.12.2019 10:53:57 | 28.0 | Male | Koblenz | Frontend Developer | 2.0 | NaN | Junior | 48000.0 | NaN | 0.0 | 0.0 | Deutsch | 1000+ | 28.0 | unlimited | Product | NaN | NaN | 2019 |
513 | 18/12/2018 15:36:09 | 37.0 | M | Stuttgart | DevOps (MS Stack) | 6.0 | NaN | Senior | 58500.0 | 12000.0 | NaN | NaN | Deutsch | 100-1000 | NaN | NaN | Product | NaN | NaN | 2018 |
343 | 03.12.2019 16:16:58 | NaN | Male | Frankfurt | Backend Developer | 8.0 | NaN | Senior | 108000.0 | NaN | 36000.0 | 0.0 | NaN | 100-1000 | 28.0 | unlimited | NaN | NaN | NaN | 2019 |
merged_dset.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3009 entries, 0 to 1252 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Timestamp 3009 non-null object 1 Age 2780 non-null float64 2 Gender 2985 non-null object 3 City 2980 non-null object 4 Position 2974 non-null object 5 Experience_yrs 2960 non-null object 6 Experience_yrs_Germany 1221 non-null object 7 Seniority_level 2960 non-null object 8 Current_salary 2993 non-null float64 9 Salary_one_yr_ago 2084 non-null float64 10 Yearly_stocks_n_bonus_current 1820 non-null object 11 Yearly_stocks_n_bonus_one_yr_ago 1605 non-null object 12 Main_language_at_work 2973 non-null object 13 Company_size 2962 non-null object 14 Vacation_days 2116 non-null object 15 Сontract_duration 2186 non-null object 16 Company_type 2918 non-null object 17 Main_tech 1126 non-null object 18 Other_tech 1096 non-null object 19 Year 3009 non-null object dtypes: float64(3), object(17) memory usage: 493.7+ KB
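Note the index above runs 0 to 1252 for 3009 rows: `pd.concat` kept each year's original row labels, so indices repeat across the merged frame. If unique labels are preferred, `ignore_index=True` resets them (a small sketch on toy frames):

```python
import pandas as pd

a = pd.DataFrame({'x': [1, 2]})
b = pd.DataFrame({'x': [3, 4]})

dup = pd.concat([a, b])                       # index: 0, 1, 0, 1 (duplicated labels)
uniq = pd.concat([a, b], ignore_index=True)   # index: 0, 1, 2, 3
```

Duplicated labels are harmless for positional access but can surprise `.loc`-based lookups later.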
Data cleaning consists of multiple steps, handled column by column below.

NB: we won't impute nulls or NaNs here; that will be carried out as part of feature engineering.
# Timestamp : convert to pd.Timestamp
print("Before conversion")
print(merged_dset.Timestamp.sample(5))
# From the samples the data looks fine; a plain type conversion is enough
merged_dset.Timestamp = merged_dset.Timestamp.map(lambda x : pd.Timestamp(x))
# After conversion
print("After conversion")
print(merged_dset.Timestamp.sample(5))
Before conversion 152 02.12.2019 15:43:54 539 18/12/2018 23:13:00 659 07.12.2019 20:13:45 1144 11/12/2020 08:39:23 538 05.12.2019 09:13:33 Name: Timestamp, dtype: object After conversion 277 2019-03-12 08:58:34 482 2020-11-25 08:05:37 1106 2020-07-12 07:09:07 749 2019-10-12 17:21:14 176 2018-12-14 17:06:24 Name: Timestamp, dtype: datetime64[ns]
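One caveat with this conversion: the 2019 stamps use the German dd.mm.yyyy order, and `pd.Timestamp` (via dateutil) assumes month-first for ambiguous dates, so e.g. 02.12.2019 is read as February 12 rather than December 2. A small sketch of the pitfall and the `dayfirst` fix:

```python
import pandas as pd

# '02.12.2019' means 2 Dec 2019 in the survey, but dateutil guesses month-first
assert pd.Timestamp('02.12.2019') == pd.Timestamp(2019, 2, 12)                   # misread
assert pd.to_datetime('02.12.2019', dayfirst=True) == pd.Timestamp(2019, 12, 2)  # intended
```

Unambiguous stamps such as 14/12/2018 are unaffected (there is no month 14), but days 1-12 in the 2019 sheet are silently swapped unless `dayfirst=True` is used.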
# Age : convert to int
print("Before conversion")
print(merged_dset.Age.sample(5))
# Replace NaNs with 0 and convert to Int
merged_dset.Age = merged_dset.Age.fillna(0).astype(int)
print("After conversion")
print(merged_dset.Age.sample(5))
Before conversion 728 30.0 287 26.0 934 26.0 820 34.0 600 30.0 Name: Age, dtype: float64 After conversion 575 31 565 34 287 29 606 33 159 32 Name: Age, dtype: int64
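Filling the missing ages with 0 does impute a value; if the NaNs should survive untouched until feature engineering, pandas' nullable `Int64` dtype is an alternative worth noting (a small sketch):

```python
import pandas as pd
import numpy as np

age = pd.Series([43.0, np.nan, 33.0])

as_zero = age.fillna(0).astype(int)   # [43, 0, 33]  - unknown age becomes 0
as_nullable = age.astype('Int64')     # [43, <NA>, 33] - missing stays missing
```

With `Int64`, `Age == 0` later never conflates "no answer" with an actual age of zero.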
# Gender : convert to Categorical
print("Before conversion")
print(merged_dset.Gender.sample(5))
# Check unique value counts in Gender
print("\nValue Counts")
print(merged_dset.Gender.value_counts(dropna = False))
# Create a map - we just need 4 categories [Male, Female, Diverse and NaN]
gender_map = {'Male': 'Male', 'M': 'Male', 'Female': 'Female', 'F': 'Female',
              'Diverse': 'Diverse', np.nan: 'NaN'}
merged_dset.Gender = merged_dset.Gender.map(lambda x: gender_map.get(x, 'NaN'))  # .get avoids KeyError on unseen values
merged_dset.Gender = pd.Categorical(merged_dset.Gender)
print("\nAfter conversion")
print(merged_dset.Gender.sample(5))
Before conversion 903 Male 583 Male 584 Female 59 Male 321 Male Name: Gender, dtype: object Value Counts Male 1887 M 646 Female 345 F 105 NaN 24 Diverse 2 Name: Gender, dtype: int64 After conversion 31 Male 515 Male 372 Male 518 Male 237 Male Name: Gender, dtype: category Categories (4, object): ['Diverse', 'Female', 'Male', 'NaN']
# City data looks good; no change needed for now
print("City")
print(merged_dset.City.value_counts(dropna=False))
City Berlin 1402 Munich 476 München 249 Frankfurt 127 Amsterdam 104 Hamburg 90 Stuttgart 56 Cologne 36 NaN 29 Köln 21 Moscow 19 Düsseldorf 17 London 14 Karlsruhe 13 Kyiv 12 Stockholm 10 Zurich 10 Stuttgart 10 Prague 9 Nuremberg 8 Warsaw 8 Vienna 7 Eindhoven 7 Krakow 6 Darmstadt 6 Tallinn 6 Dublin 6 Nürnberg 6 Heidelberg 6 Bonn 5 Hannover 5 Leipzig 4 Paris 4 Luxembourg 4 Düsseldorf 4 Dusseldorf 4 Wolfsburg 4 Helsinki 4 Nürnberg 3 Boeblingen 3 Rosenheim 3 Brussels 3 Milan 3 Schleswig-Holstein 3 Koblenz 3 Ingolstadt 3 Siegen 3 Wroclaw 3 Madrid 3 Rome 3 Mannheim 3 Dresden 3 Regensburg 3 Lisbon 3 Mannheim 2 Walldorf 2 Amsterdam 2 Bayern 2 Zug 2 Bern 2 Bielefeld 2 Barcelona 2 Saint Petersburg 2 Dublin 2 Rotterdam 2 Bremen 2 Minsk 2 Utrecht 2 Kiev 2 Saint-Petersburg 2 Aachen 2 Konstanz 2 Riga 2 Erlangen 2 Münster 2 Hildesheim 2 Limassol 2 City in Russia 1 Paderborn 1 Brussels 1 Cupertino 1 Innsbruck 1 Salzburg 1 Bodensee 1 Nuremberg 1 Lübeck 1 Malta 1 Basel 1 Braunschweig 1 Hildesheim 1 Cracovia 1 Den Haag 1 Würzburg 1 Zürich 1 Murnau am Staffelsee 1 NJ, USA 1 Riga, Latvia 1 Samara 1 Karlsruhe 1 Istanbul 1 Ulm 1 Tuttlingen 1 Jena 1 France 1 Fr 1 Duesseldorf 1 Cracow 1 Dortmund 1 Brunswick 1 Sofia 1 Dusseldurf 1 Cambridge 1 Heilbronn 1 Warsaw, Poland 1 Copenhagen 1 Prefer not to say 1 Friedrichshafen 1 Heidelberg 1 Sevilla 1 Marseille 1 Ingolstadt 1 Luttich 1 Milano 1 Bucharest 1 Tampere (Finland) 1 Bölingen 1 warsaw 1 Cork 1 Ansbach 1 Moldova 1 estonia 1 Oberhausen 1 Ireland, Dublin 1 Remote Germany 1 saint petersburg 1 homeoffice 1 bremen 1 Krakow, Poland 1 oslo 1 Memmingen 1 Alicante, Spain 1 Strasbourg 1 Essen 1 Ludwigsburg 1 Poland 1 Gdansk 1 Bitburg 1 Krakau 1 Karlskrona 1 Belgrade, Serbia 1 Brno 1 Tübingen 1 Frankonia 1 Baden-Württemberg 1 Augsburg 1 Andernach 1 MA 1 Freiburg 1 Hilversum 1 Kempten 1 Konstanz area 1 Saint-Petersburg 1 Nizhny Novgorod 1 Lausanne 1 Würzburg 1 Antwerpen 1 Tampere 1 Almere 1 Gothenburg 1 Gdańsk 1 Leeuwarden 1 Pforzheim 1 Eindhoven 1 
Kaiserslautern 1 Braunschweig 1 Lugano 1 Lingen 1 Dubai 1 Toulouse 1 Jyvaskyla 1 Odesa 1 Marburg 1 Kassel 1 Saarbrücken 1 Name: City, dtype: int64
# Position data looks good; no change needed for now
print("\nPosition")
print(merged_dset.Position.value_counts(dropna=False))
Position Backend Developer 429 Software Engineer 423 Data Scientist 243 Frontend Developer 170 DevOps 121 ... Software Developer (Python Backend) 1 Business Intelligence Analyst 1 SAP Business Analyst 1 Sr. JS Developer 1 IT Operations Manager 1 Name: Position, Length: 528, dtype: int64
pd.set_option('display.max_rows', 500)
# Position data looks good; no change needed for now
print("\nPosition")
print(merged_dset.Position.value_counts(dropna=False))
Position Backend Developer 429 Software Engineer 423 Data Scientist 243 Frontend Developer 170 DevOps 121 ... Software Developer (Python Backend) 1 Business Intelligence Analyst 1 SAP Business Analyst 1 Sr. JS Developer 1 IT Operations Manager 1 Name: Position, Length: 528, dtype: int64
# Experience_yrs : convert to float
print("\nBefore conversion")
print(merged_dset.Experience_yrs.value_counts(dropna=False))
# Cleaning function
import re  # regex fallback for free-text answers

def clean_experience(val):
    # Normalize decimal commas ('1,5' -> '1.5')
    new_val = str(val).replace(',', '.')
    try:
        new_val = float(new_val)
    except ValueError:
        # Free-text answers such as '15, thereof 8 as CTO': keep the largest embedded
        # number, comparing as floats (max() on the raw strings sorts lexicographically)
        nums = re.findall(r'\d+\.?\d*', new_val)
        new_val = max(map(float, nums)) if nums else 0.0
    return new_val
merged_dset.Experience_yrs = merged_dset.Experience_yrs.map(lambda x:clean_experience(x))
print("\nAfter conversion")
print(merged_dset.Experience_yrs.value_counts(dropna=False))
Before conversion 10.0 213 5.0 164 8.0 148 10 138 5 136 7.0 135 6.0 123 3.0 108 4.0 103 6 99 12.0 97 8 92 9.0 91 15.0 90 7 84 11.0 82 4 80 2.0 72 12 67 3 66 15 63 1.0 62 9 60 2 57 NaN 49 13.0 44 11 43 13 41 14 39 20.0 36 14.0 33 1 31 20 28 18.0 28 16.0 27 16 20 17.0 16 18 16 17 12 19.0 11 0.0 9 25.0 8 19 7 25 7 0 6 22 5 2.5 5 30 4 2.5 4 22.0 4 30.0 3 21 3 4.5 2 0.8 2 27 2 1.5 2 0.5 2 23 2 1,5 2 28.0 2 3.5 2 1.5 2 1 (as QA Engineer) / 11 in total 1 2,5 1 15, thereof 8 as CTO 1 31 1 6 (not as a data scientist, but as a lab scientist) 1 383 1 24 1 40 1 29 1 28 1 7.5 1 6.5 1 5.5 1 26 1 7.5 1 24.0 1 4.5 1 21.0 1 38.0 1 less than year 1 Name: Experience_yrs, dtype: int64 After conversion 10.0 351 5.0 300 8.0 241 6.0 223 7.0 219 4.0 183 3.0 174 12.0 164 15.0 153 9.0 151 2.0 129 11.0 126 1.0 93 13.0 85 14.0 72 20.0 64 NaN 49 16.0 47 18.0 44 17.0 28 19.0 18 0.0 16 25.0 15 2.5 10 22.0 9 30.0 7 1.5 6 21.0 4 4.5 3 28.0 3 0.5 2 27.0 2 0.8 2 23.0 2 3.5 2 24.0 2 7.5 2 26.0 1 5.5 1 38.0 1 6.5 1 29.0 1 31.0 1 383.0 1 40.0 1 Name: Experience_yrs, dtype: int64
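To make the cleaning rule concrete, here is a self-contained sketch of the same logic applied to a few of the free-text answers seen above (plain float parse first, then the largest embedded number, else 0; `parse_years` is a stand-in name for the helper):

```python
import re

def parse_years(val):
    """Parse a free-text experience answer into a float number of years."""
    text = str(val).replace(',', '.')          # decimal comma -> decimal point
    try:
        return float(text)
    except ValueError:
        nums = re.findall(r'\d+\.?\d*', text)  # any embedded numbers
        return max(map(float, nums)) if nums else 0.0

assert parse_years('1,5') == 1.5
assert parse_years('15, thereof 8 as CTO') == 15.0
assert parse_years('less than year') == 0.0
```

Taking the maximum is a heuristic: for '1 (as QA Engineer) / 11 in total' it keeps the total (11), which matches the intent of the column.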
# Experience_yrs_Germany : convert to float
print("\nBefore conversion")
print(merged_dset.Experience_yrs_Germany.value_counts(dropna=False))
# Clean the data
merged_dset.Experience_yrs_Germany = merged_dset.Experience_yrs_Germany.map(lambda x:clean_experience(x))
print("\nAfter conversion")
print(merged_dset.Experience_yrs_Germany.value_counts(dropna=False))
Before conversion NaN 1788 2 195 1 189 3 155 5 142 4 122 0 99 6 70 7 37 10 29 1.5 24 9 20 8 17 0.5 13 12 12 2.5 10 3.5 9 15 8 20 7 11 7 0,5 5 1,5 5 14 5 13 4 30 3 2,5 2 19 2 0.25 2 18 2 3,5 2 3 months 1 ⁰ 1 <1 1 3 (in Poland) 1 0,3 1 - 1 2.6 1 0.1 1 4.5 1 6 (not as a data scientist, but as a lab scientist) 1 0.2 1 4 (in Switzerland), 0 (in Germany) 1 26 1 4 month 1 17 1 16 1 < 1 1 25 1 0.9 1 0.4 1 1,7 1 0.8 1 4,5 1 less than year 1 Name: Experience_yrs_Germany, dtype: int64 After conversion NaN 1788 2.00 195 1.00 191 3.00 157 5.00 142 4.00 124 0.00 102 6.00 71 7.00 37 10.00 29 1.50 29 9.00 20 0.50 18 8.00 17 12.00 12 2.50 12 3.50 11 15.00 8 11.00 7 20.00 7 14.00 5 13.00 4 30.00 3 4.50 2 18.00 2 0.25 2 19.00 2 17.00 1 0.30 1 2.60 1 0.10 1 26.00 1 0.90 1 16.00 1 25.00 1 0.40 1 1.70 1 0.80 1 0.20 1 Name: Experience_yrs_Germany, dtype: int64
# Seniority_level : convert to ordered Categorical
print("\nBefore conversion")
print(merged_dset.Seniority_level.value_counts(dropna=False))
# Map the seniority level -> keep ['Senior', 'Middle', 'Lead', 'Junior', 'Head']; everything else (incl. NaN) becomes 'Other'
seniority_list = ['Senior', 'Middle', 'Lead', 'Junior', 'Head']
def seniority_map(x):
    if x in seniority_list:
        return x
    else:
        return 'Other'
merged_dset.Seniority_level = merged_dset.Seniority_level.map(lambda x: seniority_map(x) )
merged_dset.Seniority_level = pd.Categorical(merged_dset.Seniority_level,
categories = ['Other','Junior','Middle','Senior','Lead','Head'], ordered = True)
print("\nAfter conversion")
print(merged_dset.Seniority_level.value_counts(dropna=False))
Before conversion Senior 1649 Middle 844 Lead 201 Junior 192 Head 50 NaN 49 Principal 6 intern 1 student 1 Intern 1 Entry level 1 Working Student 1 C-Level 1 no idea, there are no ranges in the firm 1 Student 1 No level 1 C-level executive manager 1 Key 1 Director 1 CTO 1 Work Center Manager 1 Manager 1 VP 1 No level 1 Self employed 1 Name: Seniority_level, dtype: int64 After conversion Senior 1649 Middle 844 Lead 201 Junior 192 Other 73 Head 50 Name: Seniority_level, dtype: int64
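The `ordered = True` flag pays off later: comparisons against a level name work directly, making seniority filters one-liners (a toy sketch):

```python
import pandas as pd

s = pd.Series(pd.Categorical(
    ['Junior', 'Senior', 'Middle', 'Head'],
    categories=['Other', 'Junior', 'Middle', 'Senior', 'Lead', 'Head'],
    ordered=True,
))

senior_plus = s[s >= 'Senior']   # everyone at Senior level or above
print(senior_plus.tolist())      # ['Senior', 'Head']
```

Sorting by this column likewise respects the declared order rather than alphabetical order.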
merged_dset.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3009 entries, 0 to 1252 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Timestamp 3009 non-null datetime64[ns] 1 Age 3009 non-null int64 2 Gender 3009 non-null category 3 City 2980 non-null object 4 Position 2974 non-null object 5 Experience_yrs 2960 non-null float64 6 Experience_yrs_Germany 1221 non-null float64 7 Seniority_level 3009 non-null category 8 Current_salary 2993 non-null float64 9 Salary_one_yr_ago 2084 non-null float64 10 Yearly_stocks_n_bonus_current 1820 non-null object 11 Yearly_stocks_n_bonus_one_yr_ago 1605 non-null object 12 Main_language_at_work 2973 non-null object 13 Company_size 2962 non-null object 14 Vacation_days 2116 non-null object 15 Сontract_duration 2186 non-null object 16 Company_type 2918 non-null object 17 Main_tech 1126 non-null object 18 Other_tech 1096 non-null object 19 Year 3009 non-null object dtypes: category(2), datetime64[ns](1), float64(4), int64(1), object(12) memory usage: 452.9+ KB
# Current Salary - Looks good for now
print("\nBefore conversion")
print(merged_dset.Current_salary.sample(5))
Before conversion 346 60000.0 423 70000.0 666 48000.0 647 70000.0 191 65000.0 Name: Current_salary, dtype: float64
# Salary_one_yr_ago - Looks good for now
print("\nBefore conversion")
print(merged_dset.Salary_one_yr_ago.sample(5))
Before conversion 642 60000.0 107 41000.0 951 75000.0 389 53000.0 189 44000.0 Name: Salary_one_yr_ago, dtype: float64
# Yearly_stocks_n_bonus_current : convert to float (reuse the numeric cleaning helper)
print("\nBefore conversion")
print(merged_dset.Yearly_stocks_n_bonus_current.value_counts(dropna = False))
merged_dset.Yearly_stocks_n_bonus_current = merged_dset.Yearly_stocks_n_bonus_current.map(lambda x:clean_experience(x))
print("\nAfter conversion")
print(merged_dset.Yearly_stocks_n_bonus_current.value_counts(dropna=False))
Before conversion NaN 1189 0.0 517 0 227 5000 56 5000.0 54 10000 45 2000 36 3000.0 29 10000.0 27 1.0 26 6000 26 1000 23 3000 23 20000 21 6000.0 19 4000 18 15000 16 7000 15 4000.0 15 7000.0 14 2000.0 13 8000 13 75000 11 70000 11 12000 11 8000.0 9 15000.0 9 12000.0 8 20000.0 8 30000 8 1000.0 8 101 8 22000.0 8 9000.0 7 100000 7 40000 7 1500 7 25000.0 6 5500.0 6 80000 6 60000 6 11000.0 6 50000 6 25000 6 200000 5 55000.0 5 500 5 65000 5 2500.0 5 7500 5 120000 5 55000 5 30000.0 5 45000 5 4500.0 4 2.0 4 28000.0 4 1500.0 4 7500.0 4 11000 4 90000 4 35000 4 9000 4 60000.0 4 50000.0 4 77000 4 18000 4 95000 4 17000 3 4500 3 32000.0 3 150000 3 85000 3 300.0 3 500.0 3 62000 3 73000 3 37000.0 3 18000.0 3 27000.0 3 36000.0 3 80000.0 3 16000.0 3 14000.0 3 10001.0 2 13000 2 5800.0 2 17000.0 2 7001.0 2 2500 2 72000 2 45000.0 2 13001.0 2 6800.0 2 48000.0 2 0.1 2 15900 2 6300.0 2 7800.0 2 8500.0 2 56000 2 10.0 2 99000 2 800 2 91000.0 2 38000.0 2 33000.0 2 78000 2 5001.0 2 6500 2 16000 2 1400.0 2 14000 2 13000.0 2 11400.0 2 43000.0 2 41000.0 2 20700 1 70500 1 73.0 1 59000 1 58000.0 1 63909 1 1150000 1 450000 1 2400 1 4300 1 13500 1 38500 1 3400 1 14500 1 43500 1 50400 1 88000 1 28800 1 86000 1 77400 1 12 1 74000 1 19200 1 3520 1 96000 1 47500 1 45600 1 68500.0 1 3800.0 1 11900 1 bvg only 1 20240.0 1 89000 1 10500.0 1 1600 1 5500 1 4250 1 3600 1 102600 1 28000 1 53500 1 82500 1 36000 1 2800 1 107000 1 12500 1 - 1 125000 1 80400 1 7200 1 3800 1 31500 1 9450 1 63000 1 25 1 45500 1 92000 1 1200 1 1250 1 Not sure 1 110000 1 700 1 16320 1 105000 1 66447 1 82000 1 7100 1 66500 1 3375 1 depends 1 4700.0 1 6710 1 23000 1 97000 1 1100 1 15000+- 1 2200.0 1 37968 1 4400 1 5000000000 1 170000 1 84000 1 21000 1 87250 1 71000 1 106000 1 43000 1 Na 1 60847.32 1 3250 1 54000 1 58000 1 5400 1 350000.0 1 5600 1 122.13 1 39000.0 1 3200.0 1 8900.0 1 40000.0 1 52000.0 1 400.0 1 5180.0 1 20001.0 1 33500.0 1 12150.0 1 8800.0 1 103500.0 1 4170.0 1 73000.0 1 4416.0 1 3300.0 1 20000.1 1 24000.0 1 14300.0 1 
35000.0      1
1550.0       1
2400.0       1
            ...
22000        1
67000        1
40950        1
Name: Yearly_stocks_n_bonus_current, dtype: int64

After conversion
NaN             1189
0.000000e+00     749
5.000000e+03     110
1.000000e+04      72
3.000000e+03      52
2.000000e+03      49
6.000000e+03      45
4.000000e+03      33
                ...
2.000010e+04       1
2.400000e+04       1
4.095000e+04       1
Name: Yearly_stocks_n_bonus_current, dtype: int64
# Yearly_stocks_n_bonus_one_yr_ago - convert to float
print("\nBefore conversion")
print(merged_dset.Yearly_stocks_n_bonus_one_yr_ago.value_counts(dropna = False))
merged_dset.Yearly_stocks_n_bonus_one_yr_ago = merged_dset.Yearly_stocks_n_bonus_one_yr_ago.map(clean_experience)
print("\nAfter conversion")
print(merged_dset.Yearly_stocks_n_bonus_one_yr_ago.value_counts(dropna=False))
Before conversion
NaN        1404
0.0         741
0           200
5000         32
5000.0       26
10000        25
3000.0       17
           ...
4400.0        1
8005.0        1
36400         1
Name: Yearly_stocks_n_bonus_one_yr_ago, dtype: int64

After conversion
NaN         1404
0.00         943
5000.00       58
10000.00      40
3000.00       30
6000.00       23
4000.00       20
            ...
1300.00        1
7001.00        1
36400.00       1
Name: Yearly_stocks_n_bonus_one_yr_ago, dtype: int64
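These conversions all go through `clean_experience`, which is defined earlier in the notebook. Judging from the before/after counts (free-text answers such as `-` and `learning budget, bvg, gym, food` collapse to `0.0`, while `24 labour days` becomes `24.0`), a minimal re-implementation might look like the sketch below. This is an assumption about the helper's behavior inferred from the outputs, not the notebook's actual code:

```python
import re

import numpy as np
import pandas as pd

def clean_experience(x):
    """Sketch of the notebook's numeric-coercion helper (assumed behavior):
    keep the first number found in the answer, map other non-null text to 0.0,
    and leave missing values as NaN."""
    if pd.isna(x):
        return np.nan
    m = re.search(r'\d+(?:\.\d+)?', str(x))
    return float(m.group()) if m else 0.0

s = pd.Series(['5000', '24 labour days', '-', np.nan])
print(s.map(clean_experience).tolist())  # [5000.0, 24.0, 0.0, nan]
```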
# Main_language_at_work - convert to categories
print("\nBefore conversion")
print(merged_dset.Main_language_at_work.value_counts(dropna = False))
# Map the main language at work -> New categories : ['English', 'Deutsch', 'German', 'Russian', 'Other']
main_language_list = ['English','Deutsch' ,'German', 'Russian']
def language_map(x):
if x in main_language_list:
return x
else:
return 'Other'
merged_dset.Main_language_at_work = merged_dset.Main_language_at_work.map(language_map)
merged_dset.Main_language_at_work = pd.Categorical(merged_dset.Main_language_at_work)
print("\nAfter conversion")
print(merged_dset.Main_language_at_work.value_counts(dropna=False))
Before conversion
English                                  2354
Deutsch                                   316
German                                    186
Russian                                    78
NaN                                        36
French                                      8
Polish                                      6
Italian                                     5
Spanish                                     4
English and German                          2
Czech                                       2
Русский                                     2
both                                        1
Deuglisch                                   1
50/50                                       1
Dutch                                       1
Ukrainian                                   1
Polish+English                              1
English+Deutsch                             1
Deutsch/Englisch                            1
Team - Russian; Cross-team - English;       1
Russian, English                            1
Name: Main_language_at_work, dtype: int64

After conversion
English    2354
Deutsch     316
German      186
Russian      78
Other        75
Name: Main_language_at_work, dtype: int64
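The per-row `language_map` call can also be expressed vectorized. A sketch using `Series.isin`/`where` that produces the same five buckets, assuming the same `main_language_list`:

```python
import pandas as pd

main_language_list = ['English', 'Deutsch', 'German', 'Russian']
lang = pd.Series(['English', 'Deutsch', 'Polish+English', None])

# Keep values in the main list; everything else (including NaN) becomes
# 'Other', matching language_map's behavior.
collapsed = lang.where(lang.isin(main_language_list), 'Other')
print(collapsed.tolist())  # ['English', 'Deutsch', 'Other', 'Other']
```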
# Company_size - convert to categories
print("\nBefore conversion")
print(merged_dset.Company_size.value_counts(dropna = False))
# Map the company size -> New categories : ['1000+', '100-1000', '50-100', '10-50', '<10', 'NA']
Company_size_map = {'1000+' : '1000+', '100-1000':'100-1000', '101-1000' :'100-1000', '50-100':'50-100',
'51-100':'50-100','10-50':'10-50', '11-50':'10-50','up to 10':'<10',np.NaN : 'NA' }
merged_dset.Company_size = merged_dset.Company_size.map(lambda x: Company_size_map[x] )
merged_dset.Company_size = pd.Categorical(merged_dset.Company_size,
categories = ['1000+','100-1000' ,'50-100', '10-50' , '<10', 'NA'],
ordered = True)
print("\nAfter conversion")
print(merged_dset.Company_size.value_counts(dropna=False))
Before conversion
1000+       1013
100-1000     626
101-1000     405
50-100       252
10-50        222
11-50        174
51-100       147
up to 10     123
NaN           47
Name: Company_size, dtype: int64

After conversion
100-1000    1031
1000+       1013
50-100       399
10-50        396
<10          123
NA            47
Name: Company_size, dtype: int64
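Indexing `Company_size_map` inside a lambda raises a `KeyError` on any value the dict does not list. Passing the dict straight to `.map` is a gentler alternative: unknown values come back as NaN and can be filled in one step. A sketch with a hypothetical subset of the mapping:

```python
import numpy as np
import pandas as pd

# Hypothetical subset of Company_size_map
size_map = {'1000+': '1000+', '101-1000': '100-1000',
            '11-50': '10-50', 'up to 10': '<10'}
sizes = pd.Series(['1000+', '101-1000', 'up to 10', np.nan, '500ish'])

# .map(dict) returns NaN for keys missing from the dict instead of raising,
# so NaN and unseen answers can both be bucketed with a single fillna.
collapsed = sizes.map(size_map).fillna('NA')
print(collapsed.tolist())  # ['1000+', '100-1000', '<10', 'NA', 'NA']
```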
merged_dset.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3009 entries, 0 to 1252
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Timestamp                         3009 non-null   datetime64[ns]
 1   Age                               3009 non-null   int64
 2   Gender                            3009 non-null   category
 3   City                              2980 non-null   object
 4   Position                          2974 non-null   object
 5   Experience_yrs                    2960 non-null   float64
 6   Experience_yrs_Germany            1221 non-null   float64
 7   Seniority_level                   3009 non-null   category
 8   Current_salary                    2993 non-null   float64
 9   Salary_one_yr_ago                 2084 non-null   float64
 10  Yearly_stocks_n_bonus_current     1820 non-null   float64
 11  Yearly_stocks_n_bonus_one_yr_ago  1605 non-null   float64
 12  Main_language_at_work             3009 non-null   category
 13  Company_size                      3009 non-null   category
 14  Vacation_days                     2116 non-null   object
 15  Сontract_duration                 2186 non-null   object
 16  Company_type                      2918 non-null   object
 17  Main_tech                         1126 non-null   object
 18  Other_tech                        1096 non-null   object
 19  Year                              3009 non-null   object
dtypes: category(4), datetime64[ns](1), float64(6), int64(1), object(8)
memory usage: 412.2+ KB
# Vacation_days - convert to float
print("\nBefore conversion")
print(merged_dset.Vacation_days.value_counts(dropna = False))
# Clean the data
merged_dset.Vacation_days = merged_dset.Vacation_days.map(clean_experience)
print("\nAfter conversion")
print(merged_dset.Vacation_days.value_counts(dropna=False))
Before conversion
NaN     893
30      488
30.0    339
28      233
28.0    163
25.0    126
27      102
       ...
365       1
60        1
10        1
Name: Vacation_days, dtype: int64

After conversion
NaN      893
30.0     828
28.0     396
25.0     218
27.0     169
26.0     131
24.0     126
        ...
75.0       1
100.0      1
10.0       1
Name: Vacation_days, dtype: int64
# Сontract_duration - convert to categories
print("\nBefore conversion")
print(merged_dset.Сontract_duration.value_counts(dropna = False))
# Map the contract duration -> New categories : ['Unlimited', 'Temporary', '1+ year', '<=1 year', 'NA']
Сontract_duration_map = {'Unlimited contract' : 'Unlimited', 'unlimited' : 'Unlimited', 'Temporary contract':'Temporary',
'more than 1 year':'1+ year','1 year':'<=1 year', '6 months':'<=1 year','3 months':'<=1 year',
'less than 3 months' : '<=1 year', '0': '<=1 year', np.NaN : 'NA' }
merged_dset.Сontract_duration = merged_dset.Сontract_duration.map(lambda x: Сontract_duration_map[x] )
merged_dset.Сontract_duration = pd.Categorical(merged_dset.Сontract_duration,
                                               categories = ['Unlimited', 'Temporary', '1+ year', '<=1 year', 'NA'],
                                               ordered = True)
print("\nAfter conversion")
print(merged_dset.Сontract_duration.value_counts(dropna=False))
Before conversion
Unlimited contract    1159
unlimited              851
NaN                    823
Temporary contract      64
more than 1 year        59
1 year                  40
6 months                 7
3 months                 4
less than 3 months       1
0                        1
Name: Сontract_duration, dtype: int64

After conversion
Unlimited    2010
NA            823
Temporary      64
1+ year        59
<=1 year       53
Name: Сontract_duration, dtype: int64
# Company Type - convert to categories
print("\nBefore conversion")
print(merged_dset.Company_type.value_counts(dropna = False))
# Map the Company_type -> New categories : ['Product','Startup' ,'Consulting / Agency', 'Other']
Company_type_list = ['Product','Startup' ,'Consulting / Agency', 'Other']
def Company_type_map(x):
if x in Company_type_list:
return x
else:
return 'Other'
merged_dset.Company_type = merged_dset.Company_type.map(Company_type_map)
merged_dset.Company_type = pd.Categorical(merged_dset.Company_type)
print("\nAfter conversion")
print(merged_dset.Company_type.value_counts(dropna=False))
Before conversion
Product                 1830
Startup                  578
Consulting / Agency      259
NaN                       91
Agency                    74
Bodyshop / Outsource      30
Bank                      11
University                 8
                        ...
IT Consultants             1
eCommerce                  1
Name: Company_type, dtype: int64

After conversion
Product                1830
Startup                 578
Other                   342
Consulting / Agency     259
Name: Company_type, dtype: int64
# Main_tech - inspect the raw values
print("\nBefore conversion")
print(merged_dset.Main_tech.value_counts(dropna = False))
Before conversion
NaN                  1883
Java                  184
Python                164
PHP                    56
C++                    38
JavaScript             34
Javascript             31
C#                     30
python                 26
Swift                  24
Scala                  24
Go                     23
Kotlin                 22
                     ...
Qml                     1
consumer analysis       1
Name: Main_tech, dtype: int64
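Before `Main_tech` can be bucketed, the raw answers need normalizing: the counts above treat `Python`, `python`, and `Python ` as different technologies. A sketch of a case-and-whitespace normalization pass (an assumption about a sensible next step; the notebook has not cleaned this column yet):

```python
import pandas as pd

tech = pd.Series(['Java', 'java ', 'Python', 'python', 'JS', 'js'])

# Lower-casing and stripping whitespace collapses the obvious duplicates
# before any further grouping of technologies.
normalized = tech.str.strip().str.lower()
print(normalized.value_counts()['python'])  # 2
```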