-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathstatistical_formats_example.py
More file actions
171 lines (135 loc) · 5.26 KB
/
statistical_formats_example.py
File metadata and controls
171 lines (135 loc) · 5.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
Statistical formats usage example.
This example demonstrates how to parse SPSS, Stata, and SAS files
and extract metadata for analysis.
"""
import tempfile
from pathlib import Path
import pandas as pd
from statqa.analysis.univariate import UnivariateAnalyzer
from statqa.interpretation.formatter import InsightFormatter
from statqa.metadata.parsers import StatisticalFormatParser
# Optional dependency: the example can only run when pyreadstat is installed.
try:
    import pyreadstat
except ImportError:
    print("This example requires pyreadstat. Install with: pip install statqa[statistical-formats]")
    # Raise SystemExit directly instead of calling exit(): the exit() helper is
    # injected by the `site` module for interactive sessions and is not
    # guaranteed to exist (e.g. under `python -S` or in embedded interpreters).
    raise SystemExit(1)
def create_sample_spss_file():
    """Write a small labelled survey dataset to a temporary SPSS ``.sav`` file.

    The dataset holds ten respondents and six columns (``respondent_id``,
    ``age``, ``gender``, ``education``, ``satisfaction``, ``income``).
    Value labels are attached to the three coded columns, every column gets
    a descriptive label, and the file label is
    "Sample Employee Satisfaction Survey".

    Returns:
        Path: Location of the generated file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Exception: Whatever ``pyreadstat.write_sav`` raises. The temporary
        file is removed before the error propagates.
    """
    # Sample survey data
    data = {
        "respondent_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "age": [25, 34, 45, 23, 56, 67, 29, 33, 41, 52],
        "gender": [1, 2, 1, 2, 1, 1, 2, 1, 2, 1],
        "education": [3, 4, 2, 4, 1, 2, 3, 4, 3, 2],
        "satisfaction": [4, 5, 3, 2, 5, 4, 3, 4, 2, 5],
        "income": [45000, 65000, 55000, 35000, 85000, 95000, 42000, 58000, 48000, 72000],
    }
    df = pd.DataFrame(data)

    # Rich metadata for the SPSS file: code -> label for each coded column
    variable_value_labels = {
        "gender": {1: "Male", 2: "Female"},
        "education": {
            1: "High School or Less",
            2: "Some College",
            3: "Bachelor's Degree",
            4: "Graduate Degree",
        },
        "satisfaction": {
            1: "Very Dissatisfied",
            2: "Dissatisfied",
            3: "Neutral",
            4: "Satisfied",
            5: "Very Satisfied",
        },
    }
    column_labels = {
        "respondent_id": "Respondent ID",
        "age": "Age in Years",
        "gender": "Gender Identity",
        "education": "Highest Education Level",
        "satisfaction": "Job Satisfaction Level",
        "income": "Annual Household Income (USD)",
    }

    # Only reserve a unique file name here. The handle is closed before
    # pyreadstat writes, because some platforms (notably Windows) do not allow
    # a named temporary file to be opened a second time while still open.
    with tempfile.NamedTemporaryFile(suffix=".sav", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        pyreadstat.write_sav(
            df,
            temp_path,
            variable_value_labels=variable_value_labels,
            column_labels=column_labels,
            file_label="Sample Employee Satisfaction Survey",
        )
    except Exception:
        # delete=False means nothing else will clean this up; do not leave an
        # orphaned (possibly partial) file behind when the write fails.
        Path(temp_path).unlink(missing_ok=True)
        raise

    return Path(temp_path)
def main():
    """Run the end-to-end statistical-formats demonstration.

    Each step is announced on stdout:

    1. Create a temporary sample SPSS file.
    2. Validate and parse it with ``StatisticalFormatParser``; stop early if
       validation fails.
    3. Print per-variable metadata (label, type, value labels).
    4. Print dataset-level metadata taken from ``codebook.dataset_info``.
    5. Load the data with ``pyreadstat.read_sav`` and run
       ``UnivariateAnalyzer`` / ``InsightFormatter`` on the ``age`` and
       ``satisfaction`` columns.
    6. Print the equivalent CLI commands.

    The temporary file is always removed on the way out, including when
    validation fails or a step raises.
    """
    print("🎯 StatQA Statistical Formats Example")
    print("=" * 50)

    # 1. Create sample SPSS file
    print("\n📝 Creating sample SPSS file...")
    spss_file = create_sample_spss_file()
    print(f"✓ Created: {spss_file}")

    try:
        # 2. Parse with StatisticalFormatParser
        print("\n🔍 Parsing SPSS metadata...")
        parser = StatisticalFormatParser()

        # Validate file; the `finally` clause below still removes the temp
        # file when we bail out here.
        if not parser.validate(spss_file):
            print("✗ File validation failed")
            return
        print("✓ File validation passed")

        # Parse the file
        codebook = parser.parse(spss_file)
        print(f"✓ Parsed codebook: {codebook.name}")
        print(f" Variables: {len(codebook.variables)}")
        print(f" Dataset info: {len(codebook.dataset_info)} metadata fields")

        # 3. Explore extracted metadata
        print("\n📋 Variable Metadata:")
        print("-" * 30)
        for var_name, variable in codebook.variables.items():
            print(f"\n{var_name.upper()}")
            print(f" Label: {variable.label}")
            print(f" Type: {variable.var_type}")
            # Only coded variables carry value labels.
            if variable.valid_values:
                print(" Values:")
                for code, label in variable.valid_values.items():
                    print(f" {code}: {label}")

        # 4. Show dataset-level metadata
        print("\n📊 Dataset Information:")
        print("-" * 30)
        print(f"Rows: {codebook.dataset_info.get('number_rows')}")
        print(f"Columns: {codebook.dataset_info.get('number_columns')}")
        print(f"Encoding: {codebook.dataset_info.get('file_encoding')}")
        print(f"Created: {codebook.dataset_info.get('creation_time')}")

        # 5. Demonstrate integration with analysis pipeline
        print("\n🔬 Running Analysis Pipeline:")
        print("-" * 30)

        # Load the actual data for analysis; the pyreadstat metadata object is
        # not needed because the codebook already carries the labels.
        df, _metadata = pyreadstat.read_sav(spss_file)

        # Analyze a variable
        analyzer = UnivariateAnalyzer()
        formatter = InsightFormatter()

        # Analyze age variable
        age_results = analyzer.analyze(df["age"], codebook.variables["age"])
        age_insight = formatter.format_univariate(age_results)
        print(f"Age Analysis: {age_insight}")

        # Analyze satisfaction variable
        satisfaction_results = analyzer.analyze(
            df["satisfaction"], codebook.variables["satisfaction"]
        )
        satisfaction_insight = formatter.format_univariate(satisfaction_results)
        print(f"Satisfaction Analysis: {satisfaction_insight}")

        # 6. Show CLI equivalent
        print("\n⚡ CLI Equivalent:")
        print("-" * 30)
        print(f"statqa parse-codebook {spss_file.name} --output codebook.json")
        print(f"statqa analyze {spss_file.name} codebook.json --output-dir results/")

        print("\n✅ Example completed successfully!")
    finally:
        # Cleanup: the sample file was created with delete=False.
        spss_file.unlink(missing_ok=True)
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()