# Source: statqa/examples/statistical_formats_example.py at main · gojiplus/statqa · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
Statistical formats usage example.

This example demonstrates how to parse SPSS, Stata, and SAS files
and extract metadata for analysis.
"""

import tempfile
from pathlib import Path

import pandas as pd

from statqa.analysis.univariate import UnivariateAnalyzer
from statqa.interpretation.formatter import InsightFormatter
from statqa.metadata.parsers import StatisticalFormatParser


# Optional dependency guard: this example needs pyreadstat, so when the import
# fails, print an install hint and stop with a non-zero exit status.
try:
    import pyreadstat
except ImportError:
    print("This example requires pyreadstat. Install with: pip install statqa[statistical-formats]")
    # Raise SystemExit directly instead of calling the bare ``exit`` helper:
    # ``exit`` is injected by the ``site`` module for interactive sessions and
    # is not defined when Python runs with ``-S``.
    raise SystemExit(1) from None


def create_sample_spss_file() -> Path:
    """Create a sample SPSS (.sav) file for demonstration.

    Builds a ten-row survey DataFrame, attaches value labels and column
    labels, and writes it with ``pyreadstat.write_sav`` to a newly created
    temporary file ending in ``.sav``.

    Returns:
        Path of the written file. The temporary file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Exception: Any error raised by ``pyreadstat.write_sav`` propagates to
            the caller after the temporary file has been removed.
    """
    # Sample survey data
    data = {
        "respondent_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "age": [25, 34, 45, 23, 56, 67, 29, 33, 41, 52],
        "gender": [1, 2, 1, 2, 1, 1, 2, 1, 2, 1],
        "education": [3, 4, 2, 4, 1, 2, 3, 4, 3, 2],
        "satisfaction": [4, 5, 3, 2, 5, 4, 3, 4, 2, 5],
        "income": [45000, 65000, 55000, 35000, 85000, 95000, 42000, 58000, 48000, 72000],
    }

    df = pd.DataFrame(data)

    # Rich metadata for the SPSS file: code -> label mappings per variable
    variable_value_labels = {
        "gender": {1: "Male", 2: "Female"},
        "education": {
            1: "High School or Less",
            2: "Some College",
            3: "Bachelor's Degree",
            4: "Graduate Degree",
        },
        "satisfaction": {
            1: "Very Dissatisfied",
            2: "Dissatisfied",
            3: "Neutral",
            4: "Satisfied",
            5: "Very Satisfied",
        },
    }

    # Human-readable label for every column
    column_labels = {
        "respondent_id": "Respondent ID",
        "age": "Age in Years",
        "gender": "Gender Identity",
        "education": "Highest Education Level",
        "satisfaction": "Job Satisfaction Level",
        "income": "Annual Household Income (USD)",
    }

    # Reserve a unique path, then close the handle before pyreadstat writes to
    # it; delete=False keeps the file on disk after the ``with`` block exits.
    with tempfile.NamedTemporaryFile(suffix=".sav", delete=False) as temp_file:
        temp_path = Path(temp_file.name)

    try:
        pyreadstat.write_sav(
            df,
            str(temp_path),
            variable_value_labels=variable_value_labels,
            column_labels=column_labels,
            file_label="Sample Employee Satisfaction Survey",
        )
    except BaseException:
        # The caller never receives the path on failure, so nobody else could
        # clean it up; remove the file here before re-raising.
        temp_path.unlink(missing_ok=True)
        raise

    return temp_path


def main():
    """Run the statistical-formats walkthrough, printing each stage to stdout.

    Stages: write a temporary sample SPSS file, validate and parse it with
    ``StatisticalFormatParser``, print per-variable and dataset-level
    metadata, run ``UnivariateAnalyzer`` on the ``age`` and ``satisfaction``
    columns, and print the equivalent CLI commands. The temporary file is
    removed in a ``finally`` clause, so cleanup also happens on the early
    return after a failed validation and when an exception is raised.
    """
    section_rule = "-" * 30

    print("🎯 StatQA Statistical Formats Example")
    print("=" * 50)

    # Stage 1: write the sample .sav file to a temporary location.
    print("\n📊 Creating sample SPSS file...")
    sav_path = create_sample_spss_file()
    print(f"✓ Created: {sav_path}")

    try:
        # Stage 2: validate, then parse the file into a codebook.
        print("\n📋 Parsing SPSS metadata...")
        format_parser = StatisticalFormatParser()

        if not format_parser.validate(sav_path):
            print("❌ File validation failed")
            return
        print("✓ File validation passed")

        book = format_parser.parse(sav_path)
        print(f"✓ Parsed codebook: {book.name}")
        print(f"  Variables: {len(book.variables)}")
        print(f"  Dataset info: {len(book.dataset_info)} metadata fields")

        # Stage 3: list label, type and any value labels for each variable.
        print("\n📝 Variable Metadata:")
        print(section_rule)

        for name, var in book.variables.items():
            print(f"\n{name.upper()}")
            print(f"  Label: {var.label}")
            print(f"  Type: {var.var_type}")

            if not var.valid_values:
                continue
            print("  Values:")
            for code, text in var.valid_values.items():
                print(f"    {code}: {text}")

        # Stage 4: dataset-level metadata; absent keys print as None.
        print("\n📊 Dataset Information:")
        print(section_rule)
        info = book.dataset_info
        print(f"Rows: {info.get('number_rows')}")
        print(f"Columns: {info.get('number_columns')}")
        print(f"Encoding: {info.get('file_encoding')}")
        print(f"Created: {info.get('creation_time')}")

        # Stage 5: feed the real data through the analysis pipeline.
        print("\n🔬 Running Analysis Pipeline:")
        print(section_rule)

        # Only the data frame is needed; the metadata came from the codebook.
        frame, _unused_meta = pyreadstat.read_sav(sav_path)

        univariate = UnivariateAnalyzer()
        insight_formatter = InsightFormatter()

        age_insight = insight_formatter.format_univariate(
            univariate.analyze(frame["age"], book.variables["age"])
        )
        print(f"Age Analysis: {age_insight}")

        satisfaction_insight = insight_formatter.format_univariate(
            univariate.analyze(frame["satisfaction"], book.variables["satisfaction"])
        )
        print(f"Satisfaction Analysis: {satisfaction_insight}")

        # Stage 6: show the command-line equivalent of the steps above.
        print("\n⚡ CLI Equivalent:")
        print(section_rule)
        print(f"statqa parse-codebook {sav_path.name} --output codebook.json")
        print(f"statqa analyze {sav_path.name} codebook.json --output-dir results/")

        print("\n✅ Example completed successfully!")

    finally:
        # Always remove the temporary file, whatever happened above.
        sav_path.unlink(missing_ok=True)


if __name__ == "__main__":
    main()