diff --git a/src/openroad_mcp/core/manager.py b/src/openroad_mcp/core/manager.py
index 18bd1af..d8b8dc3 100644
--- a/src/openroad_mcp/core/manager.py
+++ b/src/openroad_mcp/core/manager.py
@@ -94,6 +94,9 @@ async def execute_command(
         actual_timeout = timeout_ms or self._default_timeout_ms
 
         try:
+            # Discard any output buffered before this command (e.g. startup banner)
+            # so that read_output only captures the response to this specific command.
+            await session.output_buffer.drain_all()
             await session.send_command(command)
             result = await session.read_output(actual_timeout)
 
diff --git a/src/openroad_mcp/interactive/session.py b/src/openroad_mcp/interactive/session.py
index ad924cd..ca01b7f 100644
--- a/src/openroad_mcp/interactive/session.py
+++ b/src/openroad_mcp/interactive/session.py
@@ -163,9 +163,6 @@ async def _start_background_tasks(self) -> None:
         self._writer_task = asyncio.create_task(self._write_input())
         self._exit_monitor_task = asyncio.create_task(self._monitor_exit())
 
-        # Wait for background tasks to start and initial output to be available
-        # await self._wait_for_startup_ready()
-
     async def _wait_for_startup_ready(self, timeout: float = 2.0) -> None:
         """Wait for background tasks to be ready and initial output to be available."""
         logger.info(f"Session {self.session_id} waiting for startup readiness (timeout={timeout}s)")
diff --git a/tests/performance/test_benchmarks.py b/tests/performance/test_benchmarks.py
index df1823a..ed12060 100644
--- a/tests/performance/test_benchmarks.py
+++ b/tests/performance/test_benchmarks.py
@@ -2,6 +2,7 @@
 
 import asyncio
 import math
+import os
 import time
 from unittest.mock import AsyncMock, patch
 
@@ -89,7 +90,7 @@ async def test_output_streaming_throughput(self, benchmark_timeout):
         assert duration < 5.0, f"Streaming took {duration:.3f}s (>5s timeout)"
 
     async def test_concurrent_session_scalability(self, benchmark_timeout):
-        """Test concurrent session scalability with 50+ sessions and p99/p95 latency metrics."""
+        """Test concurrent session scalability with 50+ sessions using real PTY calls."""
         session_manager = SessionManager()
 
         try:
@@ -99,7 +100,7 @@ async def test_concurrent_session_scalability(self, benchmark_timeout):
 
             start_time = time.perf_counter()
 
-            # Create sessions concurrently
+            # Create sessions concurrently using real openroad PTY calls
             async def create_session_with_delay():
                 await asyncio.sleep(0.001)  # Small delay to simulate real usage
                 return await session_manager.create_session()
@@ -114,33 +115,31 @@ async def create_session_with_delay():
             print(f"  Duration: {creation_time:.3f}s")
             print(f"  Rate: {len(session_ids) / creation_time:.1f} sessions/sec")
 
-            # Verify all sessions created successfully
+            # Verify all sessions created successfully with unique IDs (no cross-pollution)
             assert len(session_ids) == concurrent_sessions
             assert len(set(session_ids)) == concurrent_sessions  # All unique IDs, no cross-pollution
 
             # Performance assertions
             assert creation_time < 10.0, f"Concurrent creation took {creation_time:.3f}s (>10s)"
 
-            # Test concurrent command execution with per-command latency tracking
+            # Test concurrent command execution via real PTY with per-command latency tracking.
             command_latencies = []
 
-            with (
-                patch("openroad_mcp.interactive.session.InteractiveSession.send_command"),
-                patch("openroad_mcp.interactive.session.InteractiveSession.read_output") as mock_read,
-            ):
-                mock_read.return_value = AsyncMock()
-                mock_read.return_value.output = "test output"
-                mock_read.return_value.execution_time = 0.01
+            async def execute_with_latency(sid):
+                t0 = time.perf_counter()
+                result = await session_manager.execute_command(sid, "puts hello")
+                latency = time.perf_counter() - t0
+                command_latencies.append(latency)
+                return sid, result
 
-                async def execute_with_latency(session_id):
-                    t0 = time.perf_counter()
-                    result = await session_manager.execute_command(session_id, "test command")
-                    latency = time.perf_counter() - t0
-                    command_latencies.append(latency)
-                    return result
+            tasks = [execute_with_latency(sid) for sid in session_ids]
+            results = await asyncio.gather(*tasks)
 
-                tasks = [execute_with_latency(sid) for sid in session_ids]
-                await asyncio.gather(*tasks)
+            # Verify output content and session binding (no cross-pollution)
+            for sid, result in results:
+                assert result is not None, f"Session {sid} returned no result"
+                output = result.output if hasattr(result, "output") else str(result)
+                assert "hello" in output, f"Session {sid} output missing 'hello': {output!r}"
 
             # Calculate p99, p95, mean latency
             if not command_latencies:
@@ -158,16 +157,15 @@ async def execute_with_latency(session_id):
             print(f"  p99 latency: {p99_latency * 1000:.2f}ms")
 
             # Latency assertions under 50-session concurrency
-            assert mean_latency < 0.05, f"Mean latency {mean_latency * 1000:.2f}ms exceeds 50ms"
-            assert p95_latency < 0.10, f"p95 latency {p95_latency * 1000:.2f}ms exceeds 100ms"
-            assert p99_latency < 0.20, f"p99 latency {p99_latency * 1000:.2f}ms exceeds 200ms"
+            assert mean_latency < 1.0, f"Mean latency {mean_latency * 1000:.2f}ms exceeds 1000ms"
+            assert p95_latency < 2.0, f"p95 latency {p95_latency * 1000:.2f}ms exceeds 2000ms"
+            assert p99_latency < 3.0, f"p99 latency {p99_latency * 1000:.2f}ms exceeds 3000ms"
 
         finally:
             await session_manager.cleanup_all()
 
     async def test_memory_usage_profiling(self, benchmark_timeout):
         """Test memory usage profiling."""
-        import os
         import psutil