import os
import sys
import pathlib
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import survey_pipeline.analysis_utils as utils
print("Python version:", sys.version.split()[0])
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("matplotlib version:", plt.matplotlib.__version__)
%load_ext autoreload
%autoreload 2
Python version: 3.12.3
pandas version: 2.3.3
numpy version: 2.3.4
matplotlib version: 3.10.7
Survey Analysis Pipeline (MS Forms → Clean Data → Plots)¶
Date: Oct 2025
This project documents a pipeline for analyzing survey results end-to-end: ingesting raw exports, cleaning and restructuring responses, and producing report-ready visualizations (including diverging stacked bar charts for Likert-scale questions). It was inspired by a scenario where I had to analyze survey results exported from MS Forms. I wanted the workflow to be (1) repeatable, (2) auditable from raw data → final charts, and (3) easy to extend as questions, groups, and response options evolve. Through this project I've also tried to mature my project and code structure, and to learn how to "collaborate" with LLMs like ChatGPT (in a sensible, non-vibe-coding sense).
Project highlights:
- Practical data engineering for messy human-entered survey data (multi-select parsing, metadata, tidy formats)
- Visualization design choices for categorical and Likert-scale responses
- Reproducible project structure (clean separation of `raw/`, `interim/`, `processed/`, and `figures/`)
- "Human-readable analysis": notebooks as documentation, with helper functions to keep code maintainable
Note: this notebook uses a dummy scenario + dummy data.
Series navigation: Part 0 (Notes) → Part 1 (Pre-Processing) → Part 2 (Multiple Choice Analysis) → Part 3 (Likert-Scaled Visualizations)
Part 1 — Pre-Processing (Ingest + Clean + Restructure)¶
This notebook turns raw MS Forms exports into clean, analysis-ready datasets and runs some lightweight QC summaries. The main idea is traceability: keep the original export intact, apply minimal/explicit transforms, and write out interim artifacts that downstream analysis can rely on.
Overview:
- Load raw survey exports and the data dictionary (question metadata, response mappings, etc.)
- Normalize/clean a few common survey quirks (e.g., multi-select delimiter cleanup)
- Produce basic survey metadata plots (response timing, completion time) for quick QA
- Merge responses + metadata into a single cleaned dataset for analysis
Inputs:
- Survey Data Export (`raw_xlsx`): Mimics an MS Forms export (.xlsx) from a survey with mixed response types, including Likert-scaled questions.
- Data dictionary (`ddict`): Maps questions and responses to additional metadata
- Colors dictionary (`colors`): Maps question responses to colors for plotting
Primary outputs:
- Main cleaned dataset used downstream (`interim/1_survey_data_cleaned.csv`)
- Timing metadata (`processed/0_survey_responses_daily.csv` and `processed/1_survey_time.csv`)
- Figures saved under `figures/` (e.g., responses-per-day, completion-time)
Next: Part 2 — Multiple Choice Question Analysis and Bar Charts
paths = utils.get_paths()
print("Review Paths: ")
print(paths.raw_xlsx)
print(paths.ddict)
print(paths.interim)
print(paths.colors)
print(paths.processed)
Review Paths: 
/home/autrpy/projects/2025-survey-analysis-pipeline/data/0_raw/msforms_dummy_survey.xlsx
/home/autrpy/projects/2025-survey-analysis-pipeline/data/0_raw/msforms_dummy_survey_data_dictionary.csv
/home/autrpy/projects/2025-survey-analysis-pipeline/data/1_interim
/home/autrpy/projects/2025-survey-analysis-pipeline/data/0_raw/survey_plot_color_dictionary.csv
/home/autrpy/projects/2025-survey-analysis-pipeline/data/2_processed
Cleaning and Quality Checks¶
Run basic shape and sanity checks.
Checks
- Column presence: Start time, Completion time, ID, key question fields.
- Duplicates
- Multi-select delimiter normalization (";" vs "; ").
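The column-presence and duplicate checks can be sketched on toy data (all values here are hypothetical, not from the real export):

```python
import pandas as pd

# Toy responses with a deliberately duplicated ID (hypothetical values)
toy = pd.DataFrame({
    "ID": ["R-00001", "R-00002", "R-00002"],
    "Start time": pd.to_datetime(["2025-10-01 09:00", "2025-10-02 10:00", "2025-10-02 10:00"]),
})

# Column presence: list any required fields that are absent
required = ["ID", "Start time", "Completion time"]
missing = [c for c in required if c not in toy.columns]

# Duplicate check: respondent IDs should be unique
dupes = toy[toy["ID"].duplicated(keep=False)]
```

Here `missing` flags the absent `Completion time` column and `dupes` surfaces both rows sharing `R-00002`, so either condition can fail loudly before any downstream cleaning runs.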
df_raw = pd.read_excel(paths.raw_xlsx, parse_dates=["Start time", "Completion time"])
df_ddict = pd.read_csv(paths.ddict)
display(df_raw.head())
display(df_ddict.head())
|  | ID | Start time | Completion time | Email | Name | What is your organization? | What is your area of expertise? | How many years of experience do you have in your area of expertise? | What is your current role level? | What is your geographic region? | ... | Integration with my existing tools is seamless. | System performance meets my expectations. | I trust the product’s security and privacy. | Training materials helped me get started quickly. | The roadmap aligns with my needs over the next year. | The product remains responsive during peak usage. | Overall, I am satisfied with the product. | Would you recommend this to a colleague? | Consent to contact for follow-up? | Additional comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | R-00001 | 2025-10-27 12:20:21 | 2025-10-27 12:42:01 | reese.patel@globex.com | Reese Patel | Globex | Bioinformatics | 2–5 | Senior IC | North America | ... | Agree | Agree | Strongly agree | Agree | Agree | Neutral | Agree | No | Yes | Small biotech startup |
| 1 | R-00002 | 2025-10-15 03:34:21 | 2025-10-15 03:48:45 | parker.patel@starkindustries.com | Parker Patel | Stark Industries | Mechanical Engineering | 0–1 | Senior IC | Europe | ... | Agree | Strongly agree | Agree | Strongly disagree | Strongly agree | Agree | Strongly agree | Yes | No | Great overall—UI feels snappy. |
| 2 | R-00003 | 2025-10-03 11:51:26 | 2025-10-03 12:07:36 | drew.turnipshire@acme.com | Drew Turnipshire | Acme Corp | Data Science / Analytics | 2–5 | Manager | Middle East & Africa | ... | Neutral | Disagree | Agree | Agree | Agree | Disagree | Agree | Not sure | No | Small biotech startup |
| 3 | R-00004 | 2025-10-22 15:05:01 | 2025-10-22 15:17:15 | avery.gonzalez@betaanalytics.io | Avery Gonzalez | Beta Analytics | Research (Academic) | 11–15 | Senior IC | Europe | ... | Strongly agree | Neutral | Strongly agree | Disagree | Strongly agree | Strongly agree | Strongly agree | Not sure | Yes | Docs could be deeper on integration examples. |
| 4 | R-00005 | 2025-10-08 03:14:59 | 2025-10-08 03:31:14 | blake.martin@initech.com | Blake Martin | Initech | Software Engineering | 16+ | Senior IC | Latin America | ... | Agree | Strongly agree | Strongly disagree | Neutral | Agree | Agree | Strongly agree | Not sure | Yes | Great overall—UI feels snappy. |
5 rows × 26 columns
|  | variable_name | question_index | category | question_text_or_description | type | allowed_values_or_codes | notes | example_value |
|---|---|---|---|---|---|---|---|---|
| 0 | ID | NaN | metadata | Record identifier | identifier (string) | Pattern: R-00001 … | Unique per response. | R-00001 |
| 1 | Start time | NaN | metadata | Survey start timestamp | datetime (YYYY-MM-DD HH:MM:SS) | Format: YYYY-MM-DD HH:MM:SS (24h) | Local timezone when captured; ensure consisten... | 2025-10-11 19:25:56 |
| 2 | Completion time | NaN | metadata | Survey completion timestamp | datetime (YYYY-MM-DD HH:MM:SS) | Format: YYYY-MM-DD HH:MM:SS (24h) | Local timezone when captured; ensure consisten... | 2025-10-11 19:35:17 |
| 3 | Email | NaN | metadata | Respondent email address | email (string) | name.surname@organization-domain | NaN | morgan.turnipshire@vandelay-industries.example |
| 4 | Name | NaN | metadata | Respondent full name | name (string) | NaN | NaN | Morgan Turnipshire |
Value Counts¶
Inspect responses and data types. Looking for typos/strays, appropriate “Other” responses, singletons, and unexpected responses.
df_raw_c = df_raw.copy()
# cleaning here... if there were some
display(df_raw_c.info())
print("\n\n")
display(df_raw_c["What is your organization?"].value_counts())
print("\n\n")
display(df_raw_c["What is your area of expertise?"].value_counts())
print("\n\n")
display(df_raw_c["What is your current role level?"].value_counts())
print("\n\n")
tool_q = ["Which tools do you use regularly? (Select all that apply)"]
df_raw_c.loc[:, tool_q] = df_raw_c.loc[:, tool_q].replace("; ", ";", regex=True)
display(df_raw_c.loc[:, tool_q].stack().str.split(";").explode().value_counts().to_frame())
print("\n\n")
df_raw_c.to_csv(paths.interim / "0_survey_raw_clean.csv")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 26 columns):
 #   Column                                                                Non-Null Count  Dtype
---  ------                                                                --------------  -----
 0   ID                                                                    150 non-null    object
 1   Start time                                                            150 non-null    datetime64[ns]
 2   Completion time                                                       150 non-null    datetime64[ns]
 3   Email                                                                 150 non-null    object
 4   Name                                                                  150 non-null    object
 5   What is your organization?                                            150 non-null    object
 6   What is your area of expertise?                                       150 non-null    object
 7   How many years of experience do you have in your area of expertise?   150 non-null    object
 8   What is your current role level?                                      150 non-null    object
 9   What is your geographic region?                                       150 non-null    object
 10  Which tools do you use regularly? (Select all that apply)             150 non-null    object
 11  The documentation is clear and sufficient.                            150 non-null    object
 12  The tool meets my day-to-day needs.                                   150 non-null    object
 13  The UI is intuitive and easy to learn.                                150 non-null    object
 14  Customer support is responsive.                                       150 non-null    object
 15  The feature set covers my key workflows.                              150 non-null    object
 16  Integration with my existing tools is seamless.                       150 non-null    object
 17  System performance meets my expectations.                             150 non-null    object
 18  I trust the product’s security and privacy.                           150 non-null    object
 19  Training materials helped me get started quickly.                     150 non-null    object
 20  The roadmap aligns with my needs over the next year.                  150 non-null    object
 21  The product remains responsive during peak usage.                     150 non-null    object
 22  Overall, I am satisfied with the product.                             150 non-null    object
 23  Would you recommend this to a colleague?                              150 non-null    object
 24  Consent to contact for follow-up?                                     150 non-null    object
 25  Additional comments                                                   127 non-null    object
dtypes: datetime64[ns](2), object(24)
memory usage: 30.6+ KB
None
What is your organization?
Globex                 18
Fabrikam               16
Other                  14
Acme Corp              14
Wayne Enterprises      12
Contoso                11
Vandelay Industries    10
Northwind Traders      10
Blue River Labs        10
Prefer not to say       9
Stark Industries        7
Initech                 7
Beta Analytics          6
Umbrella                6
Name: count, dtype: int64
What is your area of expertise?
Software Engineering            16
Other                           16
Research (Academic)             15
Program / Project Management    15
Bioinformatics                  12
UX / Human Factors              12
Mechanical Engineering          12
Product Management              12
Prefer not to say               10
Data Science / Analytics         9
Systems Engineering              8
Electrical Engineering           8
Operations / DevOps              5
Name: count, dtype: int64
What is your current role level?
Individual Contributor    23
Senior Manager            22
Senior IC                 22
VP                        22
Director                  21
Manager                   20
C-suite                   20
Name: count, dtype: int64
| count | |
|---|---|
| RStudio | 42 |
| Docker | 42 |
| Power BI | 42 |
| Git | 41 |
| MATLAB | 40 |
| VS Code | 40 |
| JupyterLab | 40 |
| Tableau | 33 |
| Excel | 33 |
| Other | 32 |
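The delimiter normalization and explode-count above can be illustrated on a toy multi-select column (values hypothetical):

```python
import pandas as pd

# Toy multi-select answers mixing "; " and ";" delimiters (hypothetical values)
tools = pd.Series(["Git; Excel", "Git;Docker", "Excel"])

# Normalize "; " to ";" so splitting yields clean tokens without stray spaces
tools = tools.str.replace("; ", ";", regex=False)

# One row per selected tool, then count selections per tool
counts = tools.str.split(";").explode().value_counts()
```

Without the normalization step, `"Git"` and `" Excel"` would count separately from `"Git"` and `"Excel"`, silently splitting tallies.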
Timing Metadata¶
%matplotlib inline
# Survey Responses per day
daily = df_raw_c["Completion time"]
fig, axs = utils.plot_responses_over_time(daily)
daily.to_csv(paths.processed / "0_survey_responses_daily.csv")
fig.savefig(paths.figures / "survey-meta-responses-per-day.png")
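The responses-per-day aggregation inside `utils.plot_responses_over_time` isn't shown here; one plausible way to bucket timestamps into daily counts (toy values, hypothetical) is:

```python
import pandas as pd

# Toy completion timestamps (hypothetical values)
completion = pd.Series(pd.to_datetime([
    "2025-10-01 09:15", "2025-10-01 17:40", "2025-10-02 08:05",
]))

# Collapse each timestamp to its calendar day, then count responses per day
per_day = completion.dt.normalize().value_counts().sort_index()
```

`dt.normalize()` keeps the datetime dtype (midnight of each day), which makes the resulting index plot cleanly on a date axis.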
# Average Completion Time
%matplotlib inline
df_t = df_raw_c[["ID", "Start time", "Completion time"]].set_index("ID")
df_t["survey time (min)"] = df_t["Completion time"] - df_t["Start time"]
df_t["survey time (min)"] = df_t["survey time (min)"].dt.total_seconds() / 60.0
utils.plot_completion_time(df_t["survey time (min)"])
fig = plt.gcf()  # grab the figure just drawn so savefig below doesn't reuse the previous cell's figure
df_t.to_csv(paths.processed / "1_survey_time.csv")
fig.savefig(paths.figures / "survey-meta-completion-time.png")
Plot Colors¶
The colors dictionary maps question responses to colors. Here we plot that mapping for review.
response_plot_colors = utils.get_plot_colors(paths.colors)
utils.visualize_colors(list(response_plot_colors.values()), list(response_plot_colors.keys()))
(<Figure size 400x2950 with 1 Axes>,
<Axes: title={'center': 'Visualizing Colors'}>)
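`utils.get_plot_colors` isn't shown here; a minimal sketch of loading such a mapping, assuming a two-column CSV with hypothetical `response` and `color` headers, could look like:

```python
import io
import pandas as pd

# Hypothetical colors CSV: one row per response label
colors_csv = io.StringIO(
    "response,color\n"
    "Strongly agree,#1a9641\n"
    "Agree,#a6d96a\n"
    "Neutral,#ffffbf\n"
)
df_colors = pd.read_csv(colors_csv)

# Build a response-label -> hex-color lookup for plotting
color_map = dict(zip(df_colors["response"], df_colors["color"]))
```

Keeping the mapping in a CSV (rather than hard-coding it) means colors stay consistent across every figure and can be reviewed alongside the other data-dictionary files.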
Restructuring and Combining¶
We merge cleaned responses with the data dictionary to produce a single dataset for analysis.
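The internals of `utils.survey_ddict_combine` aren't shown in this notebook; one plausible melt-and-merge sketch (toy data, hypothetical column names) is:

```python
import pandas as pd

# Toy wide-format responses and a matching data dictionary (hypothetical values)
responses = pd.DataFrame({
    "ID": ["R-00001"],
    "What is your organization?": ["Globex"],
})
ddict = pd.DataFrame({
    "variable_name": ["What is your organization?"],
    "question_index": ["Q1"],
    "category": ["demographics"],
})

# Wide -> long: one row per (respondent, question)
long_df = responses.melt(id_vars="ID", var_name="question", value_name="response")

# Attach per-question metadata from the data dictionary
combined = long_df.merge(ddict, left_on="question", right_on="variable_name", how="left")
```

The long format is what makes the downstream parts easy: grouping by `category` or `question_index` becomes a plain `groupby` instead of column-name bookkeeping.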
plt.close()
%matplotlib inline
df = utils.survey_ddict_combine(df_raw_c, df_ddict)
df.to_csv(paths.interim / "1_survey_data_cleaned.csv")
display(df.head(), df.describe())
|  |  |  |  |  |  |  |  | response |
|---|---|---|---|---|---|---|---|---|
| ID | organization | expertise | role | category | question_index | question | question_text_or_description |  |
| R-00001 | Globex | Bioinformatics | Senior IC | metadata | NaN | Start time | Survey start timestamp | 2025-10-27 12:20:21 |
|  |  |  |  |  |  | Completion time | Survey completion timestamp | 2025-10-27 12:42:01 |
|  |  |  |  |  |  | Email | Respondent email address | reese.patel@globex.com |
|  |  |  |  |  |  | Name | Respondent full name | Reese Patel |
|  |  |  |  | demographics | Q1 | What is your organization? | What is your organization? | Globex |
|  | response |
|---|---|
| count | 3962 |
| unique | 644 |
| top | Agree |
| freq | 710 |
Next¶
Proceed to Part 2 - Multiple Choice Question Analysis and Bar Charts