diff --git a/Makefile b/Makefile
index 5c328a492f..90538ea1a0 100644
--- a/Makefile
+++ b/Makefile
@@ -29,6 +29,7 @@ typing: ## Run typing checks
 	$(VENV_BIN)/uv pip install "pyarrow<24"
 	$(VENV_BIN)/uv run --no-sync pyright
 	$(VENV_BIN)/uv run --no-sync mypy
+	$(VENV_BIN)/uv run --no-sync pyrefly check
 
 .PHONY: docs-serve
 docs-serve:  # Build and serve the docs locally
diff --git a/pyproject.toml b/pyproject.toml
index b3bab1bf0e..44ebf0ee53 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,6 +79,7 @@ typing = [  # keep some of these pinned and bump periodically so there's fewer s
   "pandas-stubs==2.3.0.250703",
   "typing_extensions",
   "mypy~=1.15.0",
+  "pyrefly",
   "pyright==1.1.408",  # https://github.com/narwhals-dev/narwhals/issues/3584
   "pyarrow-stubs==19.2",
   "narwhals[dask]",
@@ -387,3 +388,17 @@ ignore = [
   "../../../**/Lib",      # stdlib
   "../../../**/typeshed*" # typeshed-fallback
 ]
+
+[tool.pyrefly]  # settings for the `pyrefly check` step of `make typing`
+project-includes = ["tests"]  # limit checking to the test suite
+ignore-missing-imports = [  # optional backends that may not be installed; don't report their imports as missing
+    "cudf.*",
+    "cupy.*",
+    "dask.*",
+    "dask_expr.*",
+    "joblib.*",
+    "modin.*",
+    "pyspark.*",
+    "sklearn.*",
+    "sqlparse.*",
+]
diff --git a/tests/conftest.py b/tests/conftest.py
index 3e80bcdff4..3fc3e91fa9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -244,7 +244,7 @@ def ibis_lazy_constructor(obj: Data) -> ibis.Table:  # pragma: no cover
     "cudf": cudf_constructor,
     "polars[eager]": polars_eager_constructor,
 }
-LAZY_CONSTRUCTORS: dict[str, ConstructorLazy] = {
+LAZY_CONSTRUCTORS: dict[str, ConstructorLazy] = {  # pyrefly: ignore[bad-assignment]
     "dask": dask_lazy_p2_constructor,
     "polars[lazy]": polars_lazy_constructor,
     "duckdb": duckdb_lazy_constructor,
diff --git a/tests/dependencies/is_into_dataframe_test.py b/tests/dependencies/is_into_dataframe_test.py
index 71978f04be..32b2251ad6 100644
--- a/tests/dependencies/is_into_dataframe_test.py
+++ b/tests/dependencies/is_into_dataframe_test.py
@@ -52,6 +52,6 @@ def test_is_into_dataframe_other() -> None:
     pytest.importorskip("numpy")
     import numpy as np
 
-    assert is_into_dataframe(DictDataFrame(DATA))
+    assert is_into_dataframe(DictDataFrame(DATA))  # pyrefly: ignore[bad-specialization]
     assert not is_into_dataframe(np.array([[1, 4], [2, 5], [3, 6]]))
     assert not is_into_dataframe(DATA)
diff --git a/tests/dependencies/is_into_series_test.py b/tests/dependencies/is_into_series_test.py
index 2d064835bc..8aa895343b 100644
--- a/tests/dependencies/is_into_series_test.py
+++ b/tests/dependencies/is_into_series_test.py
@@ -49,6 +49,6 @@ def test_is_into_series() -> None:
     pytest.importorskip("numpy")
     import numpy as np
 
-    assert is_into_series(ListBackedSeries("a", [1, 4, 2]))
+    assert is_into_series(ListBackedSeries("a", [1, 4, 2]))  # pyrefly: ignore[bad-specialization]
     assert not is_into_series(np.array([1, 2, 3]))
     assert not is_into_series([1, 2, 3])
diff --git a/tests/dtypes/pandas_extension_dtypes_test.py b/tests/dtypes/pandas_extension_dtypes_test.py
index bef74d75c8..43f06b49fa 100644
--- a/tests/dtypes/pandas_extension_dtypes_test.py
+++ b/tests/dtypes/pandas_extension_dtypes_test.py
@@ -25,7 +25,7 @@ class CustomInt16Dtype(pd.api.extensions.ExtensionDtype):  # pragma: no cover
 
     @classmethod
     def construct_array_type(cls) -> type[pd.api.extensions.ExtensionArray]:  # type: ignore[valid-type]
-        return CustomInt16Array
+        return CustomInt16Array  # pyrefly: ignore[bad-return]
 
 
 class CustomInt16Array(pd.api.extensions.ExtensionArray):  # pragma: no cover
@@ -56,7 +56,7 @@ class CustomInt32Dtype(pd.api.extensions.ExtensionDtype):  # pragma: no cover
 
     @classmethod
     def construct_array_type(cls) -> type[pd.api.extensions.ExtensionArray]:  # type: ignore[valid-type]
-        return CustomInt32Array
+        return CustomInt32Array  # pyrefly: ignore[bad-return]
 
     def __hash__(self) -> int:
         return hash(self.name)
diff --git a/tests/expr_and_series/lit_test.py b/tests/expr_and_series/lit_test.py
index 188292c0d1..7fab0ce2ec 100644
--- a/tests/expr_and_series/lit_test.py
+++ b/tests/expr_and_series/lit_test.py
@@ -46,7 +46,7 @@ def test_lit_error(constructor: Constructor) -> None:
     with pytest.raises(
         ValueError, match="numpy arrays are not supported as literal values"
     ):
-        _ = df.with_columns(nw.lit(np.array([1, 2])).alias("lit"))  # pyright: ignore[reportArgumentType]
+        _ = df.with_columns(nw.lit(np.array([1, 2])).alias("lit"))  # pyright: ignore[reportArgumentType]  # pyrefly: ignore[bad-argument-type]
 
 
 def test_lit_out_name(constructor: Constructor) -> None:
diff --git a/tests/frame/collect_test.py b/tests/frame/collect_test.py
index 66ea38e979..42d01d0a6f 100644
--- a/tests/frame/collect_test.py
+++ b/tests/frame/collect_test.py
@@ -149,7 +149,7 @@ def test_collect_with_kwargs(constructor: Constructor) -> None:
         if POLARS_VERSION > (1, 29, 0)
         else {"no_optimization": True}
     )
-    collect_kwargs = {
+    collect_kwargs: dict[nw.Implementation, Any] = {
         nw.Implementation.POLARS: pl_kwargs,
         nw.Implementation.DASK: {"optimize_graph": False},
         nw.Implementation.PYARROW: {},
diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py
index 42d52adafc..dc47ffb425 100644
--- a/tests/frame/join_test.py
+++ b/tests/frame/join_test.py
@@ -797,10 +797,10 @@ def test_join_duplicate_column_names(
     else:
         exception = nw.exceptions.DuplicateError
     if isinstance(df, nw.LazyFrame):
-        with pytest.raises(exception):
+        with pytest.raises(exception):  # pyrefly: ignore[unbound-name]
             df.join(df, on=["a"]).join(df, on=["a"]).collect()
     else:
-        with pytest.raises(exception):
+        with pytest.raises(exception):  # pyrefly: ignore[unbound-name]
             df.join(df, on=["a"]).join(df, on=["a"])
 
 
diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py
index d90916b029..a4ee5d36a3 100644
--- a/tests/frame/schema_test.py
+++ b/tests/frame/schema_test.py
@@ -149,7 +149,7 @@ def test_dtypes() -> None:
         },
     )
     df_from_pl = nw.from_native(df_pl, eager_only=True)
-    expected = {
+    expected: dict[str, type[nw.dtypes.DType]] = {
         "a": nw.Int64,
         "b": nw.Int32,
         "c": nw.Int16,
diff --git a/tests/implementation_test.py b/tests/implementation_test.py
index d15a569a52..2075612d14 100644
--- a/tests/implementation_test.py
+++ b/tests/implementation_test.py
@@ -162,7 +162,7 @@ def test_pandas_typing(native: pd.DataFrame) -> None:
         # [False Positive]
         any_df.lazy(ldf_impl)
         # [True Negative]
-        any_df.lazy(ser_impl)  # pyright: ignore[reportArgumentType]
+        any_df.lazy(ser_impl)  # pyright: ignore[reportArgumentType]  # pyrefly: ignore[bad-argument-type]
         # [True Positive]
         any_ldf.collect(df_impl)
         any_ldf.collect(ldf_impl)
@@ -170,7 +170,7 @@ def test_pandas_typing(native: pd.DataFrame) -> None:
 
         assert_type(df_impl, _PandasImpl)
         # NOTE: Would require adding overloads to `DataFrame.lazy`
-        assert_type(ldf_impl, _PandasImpl)  # pyright: ignore[reportAssertTypeFailure]
+        assert_type(ldf_impl, _PandasImpl)  # pyright: ignore[reportAssertTypeFailure]  # pyrefly: ignore[assert-type]
         assert_type(ser_impl, _PandasImpl)
 
     def test_arrow_typing(native: pa.Table) -> None:
@@ -187,7 +187,7 @@ def test_arrow_typing(native: pa.Table) -> None:
         # [False Positive]
         any_df.lazy(ldf_impl)
         # [True Negative]
-        any_df.lazy(ser_impl)  # pyright: ignore[reportArgumentType]
+        any_df.lazy(ser_impl)  # pyright: ignore[reportArgumentType] # pyrefly: ignore[bad-argument-type]
         # [True Positive]
         any_ldf.collect(df_impl)
         any_ldf.collect(ldf_impl)
@@ -195,7 +195,7 @@ def test_arrow_typing(native: pa.Table) -> None:
 
         assert_type(df_impl, _ArrowImpl)
         # NOTE: Would require adding overloads to `DataFrame.lazy`
-        assert_type(ldf_impl, _ArrowImpl)  # pyright: ignore[reportAssertTypeFailure]
+        assert_type(ldf_impl, _ArrowImpl)  # pyright: ignore[reportAssertTypeFailure] # pyrefly: ignore[assert-type]
         assert_type(ser_impl, _ArrowImpl)
 
     def test_duckdb_typing(native: duckdb.DuckDBPyRelation) -> None:
@@ -218,7 +218,7 @@ def test_sqlframe_typing(native: BaseDataFrame[Any, Any, Any, Any, Any]) -> None
         # [True Positive]
         any_df.lazy(ldf_impl)
         # [True Negative]
-        any_ldf.collect(ldf_impl)  # pyright: ignore[reportArgumentType]
+        any_ldf.collect(ldf_impl)  # pyright: ignore[reportArgumentType] # pyrefly: ignore[bad-argument-type]
 
         assert_type(ldf.implementation, _SQLFrameImpl)
 
@@ -232,7 +232,7 @@ def test_ibis_typing(native: ibis.Table) -> None:
         # [True Negative]
         any_ldf.collect(ldf_impl)  # pyright: ignore[reportArgumentType]
 
-        assert_type(ldf.implementation, _IbisImpl)
+        assert_type(ldf.implementation, _IbisImpl)  # pyrefly: ignore[assert-type] (todo)
 
     def test_dask_typing(native: dd.DataFrame) -> None:
         ldf = nw.from_native(native)
@@ -242,7 +242,7 @@ def test_dask_typing(native: dd.DataFrame) -> None:
         # [True Positive]
         any_df.lazy(ldf_impl)
         # [True Negative]
-        any_ldf.collect(ldf_impl)  # pyright: ignore[reportArgumentType]
+        any_ldf.collect(ldf_impl)  # pyright: ignore[reportArgumentType] # pyrefly: ignore[bad-argument-type]
 
         assert_type(ldf.implementation, _DaskImpl)
 
@@ -255,10 +255,10 @@ def test_modin_typing(native: mpd.DataFrame) -> None:
         ser_impl = ser.implementation
 
         # [True Negative]
-        any_df.lazy(df_impl)  # pyright: ignore[reportArgumentType]
-        any_df.lazy(ser_impl)  # pyright: ignore[reportArgumentType]
-        any_ldf.collect(df_impl)  # pyright: ignore[reportArgumentType]
-        any_ldf.collect(ser_impl)  # pyright: ignore[reportArgumentType]
+        any_df.lazy(df_impl)  # pyright: ignore[reportArgumentType] # pyrefly: ignore[bad-argument-type]
+        any_df.lazy(ser_impl)  # pyright: ignore[reportArgumentType] # pyrefly: ignore[bad-argument-type]
+        any_ldf.collect(df_impl)  # pyright: ignore[reportArgumentType] # pyrefly: ignore[bad-argument-type]
+        any_ldf.collect(ser_impl)  # pyright: ignore[reportArgumentType] # pyrefly: ignore[bad-argument-type]
 
         assert_type(df_impl, _ModinImpl)
         assert_type(ser_impl, _ModinImpl)
@@ -276,9 +276,9 @@ def test_any_typing() -> None:
         any_ldf.collect(ldf_impl)
         any_ldf.collect(ser_impl)
 
-        assert_type(df_impl, _EagerAllowedImpl)  # pyright: ignore[reportAssertTypeFailure]
-        assert_type(ldf_impl, _LazyAllowedImpl)  # pyright: ignore[reportAssertTypeFailure]
-        assert_type(ser_impl, _EagerAllowedImpl)  # pyright: ignore[reportAssertTypeFailure]
+        assert_type(df_impl, _EagerAllowedImpl)  # pyright: ignore[reportAssertTypeFailure] # pyrefly: ignore[assert-type]
+        assert_type(ldf_impl, _LazyAllowedImpl)  # pyright: ignore[reportAssertTypeFailure] # pyrefly: ignore[assert-type]
+        assert_type(ser_impl, _EagerAllowedImpl)  # pyright: ignore[reportAssertTypeFailure] # pyrefly: ignore[assert-type]
         # Fallback, matches the first overload `_PolarsImpl`
         assert_type(df_impl, _PolarsImpl)
         assert_type(ldf_impl, _PolarsImpl)
diff --git a/tests/namespace_test.py b/tests/namespace_test.py
index e94a6690c1..34d0d60204 100644
--- a/tests/namespace_test.py
+++ b/tests/namespace_test.py
@@ -91,7 +91,8 @@ def test_namespace_from_backend_typing(backend: _EagerAllowed) -> None:
     if TYPE_CHECKING:
         assert_type(
             namespace,
-            "Namespace[PolarsNamespace] | Namespace[PandasLikeNamespace] | Namespace[ArrowNamespace]",
+            # pyrefly: `PolarsNamespace` is not assignable to upper bound `CompliantNamespace` of type variable `CompliantNamespaceT_co`
+            "Namespace[PolarsNamespace] | Namespace[PandasLikeNamespace] | Namespace[ArrowNamespace]",  # pyrefly: ignore[bad-specialization]
         )
     assert repr(namespace) in {
         "Namespace[PolarsNamespace]",
@@ -188,7 +189,7 @@ def test_namespace_is_native() -> None:
     native_2 = pl.DataFrame({"a": unrelated})
 
     maybe_native: list[pl.Series | list[int]] = [native_1, unrelated]
-    always_native = list["pl.DataFrame | pl.Series"]((native_2, native_1))
+    always_native = list["pl.DataFrame | pl.Series"]((native_2, native_1))  # pyrefly: ignore[not-a-type] https://github.com/facebook/pyrefly/issues/3193
     never_native = [unrelated, 50]
 
     expected_maybe = [True, False]
@@ -210,7 +211,7 @@ def test_namespace_is_native() -> None:
             # NOTE: We can't spell intersections *yet* (https://github.com/python/typing/issues/213)
             # Would be:
             # `<subclass of list[int] and DataFrame> | <subclass of list[int] and LazyFrame> | <subclass of list[int] and Series>``
-            assert_type(unrelated, "Never")  # pyright: ignore[reportAssertTypeFailure]
+            assert_type(unrelated, "Never")  # pyright: ignore[reportAssertTypeFailure] # pyrefly: ignore[assert-type]
         else:
             assert_type(unrelated, "list[int]")
 
@@ -227,14 +228,15 @@ def test_namespace_is_native() -> None:
             assert_type(native_2, "Never")
 
         always_item = always_native[1]
-        assert_type(always_item, "pl.DataFrame | pl.Series")
+        # pyrefly: `always_item` is `Unknown`
+        assert_type(always_item, "pl.DataFrame | pl.Series")  # pyrefly: ignore[assert-type] (todo)
         if ns.is_native(always_item):
-            assert_type(always_item, "pl.DataFrame | pl.Series")
+            assert_type(always_item, "pl.DataFrame | pl.Series")  # pyrefly: ignore[assert-type] (todo)
             if ns._dataframe._is_native(always_item):
                 assert_type(always_item, "pl.DataFrame")
             elif ns._series._is_native(always_item):
                 assert_type(always_item, "pl.Series")
             else:
-                assert_type(always_item, "Never")
+                assert_type(always_item, "Never")  # pyrefly: ignore[assert-type] (todo)
         else:
-            assert_type(always_item, "Never")
+            assert_type(always_item, "Never")  # pyrefly: ignore[assert-type] (todo)
diff --git a/tests/translate/from_native_test.py b/tests/translate/from_native_test.py
index 8d076699c0..9f87b5220b 100644
--- a/tests/translate/from_native_test.py
+++ b/tests/translate/from_native_test.py
@@ -297,7 +297,7 @@ def test_series_only_sqlframe() -> None:  # pragma: no cover
     df = sqlframe_pyspark_lazy_constructor(data)
 
     with pytest.raises(TypeError, match="Cannot only use `series_only`"):
-        nw.from_native(df, series_only=True)  # pyright: ignore[reportArgumentType, reportCallIssue]
+        nw.from_native(df, series_only=True)  # pyright: ignore[reportArgumentType, reportCallIssue]  # pyrefly: ignore[no-matching-overload]
 
 
 @pytest.mark.parametrize(
diff --git a/tests/typing_compat_test.py b/tests/typing_compat_test.py
index ffb026e8dc..35146595d4 100644
--- a/tests/typing_compat_test.py
+++ b/tests/typing_compat_test.py
@@ -18,7 +18,7 @@ def test_assert_never() -> None:
     some: Literal["a"] = "a"
     if some != "a":
         assigned = "b"
-        assert_never(assigned)
+        assert_never(assigned)  # pyrefly: ignore[bad-argument-type] https://github.com/facebook/pyrefly/issues/3202
     else:
         assigned = some
     if not TYPE_CHECKING:
diff --git a/tests/utils.py b/tests/utils.py
index 4d01223b2a..b9fa613bca 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -194,7 +194,8 @@ def pyspark_session() -> SparkSession:  # pragma: no cover
         else builder.master("local[1]").config("spark.ui.enabled", "false")
     )
     return (
-        builder.config("spark.default.parallelism", "1")
+        # Don't remove pyrefly-ignore, needed in CI when pyspark is installed.
+        builder.config("spark.default.parallelism", "1")  # pyrefly: ignore[bad-return]
         .config("spark.sql.shuffle.partitions", "2")
         .config("spark.sql.session.timeZone", "UTC")
         .getOrCreate()
diff --git a/tests/utils_test.py b/tests/utils_test.py
index c8be4131fb..862d5c3c01 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -416,7 +416,7 @@ def str(self) -> PolarsExprStringNamespace:  # type: ignore[override]
             pl_expr = cast("PolarsExpr", self)
             return PolarsExprStringNamespace(pl_expr)
 
-        dt = not_implemented()
+        dt: Any = not_implemented()
 
         # NOTE: Typing is happy w/ double property
         @property