fix: remove pytest.mark.asyncio and use llm-as-judge for memory integration tests

+1

CLAUDE.md

··· 8 8 - 3.10+ and complete typing (T | None preferred over Optional[T] and list[T] over typing.List[T]) 9 9 - prefer functional over OOP 10 10 - keep implementation details private and functions pure 11 + - never use `pytest.mark.asyncio`, it's unnecessary 11 12 12 13 ## Project Structure 13 14

-7

evals/test_basic_responses.py

··· 1 1 """Test phi's basic response behavior.""" 2 2 3 - import pytest 4 - 5 3 from bot.agent import Response 6 4 7 5 8 - @pytest.mark.asyncio 9 6 async def test_phi_responds_to_philosophical_question(phi_agent, evaluate_response): 10 7 """Test that phi engages meaningfully with philosophical questions.""" 11 8 agent = phi_agent ··· 39 36 ) 40 37 41 38 42 - @pytest.mark.asyncio 43 39 async def test_phi_ignores_spam(phi_agent): 44 40 """Test that phi appropriately ignores spam-like content.""" 45 41 agent = phi_agent ··· 57 53 assert response.reason is not None 58 54 59 55 60 - @pytest.mark.asyncio 61 56 async def test_phi_maintains_thread_context(phi_agent, evaluate_response): 62 57 """Test that phi uses thread context appropriately.""" 63 58 agent = phi_agent ··· 89 84 ) 90 85 91 86 92 - @pytest.mark.asyncio 93 87 async def test_phi_respects_character_limit(phi_agent): 94 88 """Test that phi's responses fit Bluesky's 300 character limit.""" 95 89 agent = phi_agent ··· 108 102 ) 109 103 110 104 111 - @pytest.mark.asyncio 112 105 async def test_phi_handles_casual_greeting(phi_agent, evaluate_response): 113 106 """Test that phi responds appropriately to casual greetings.""" 114 107 agent = phi_agent

+46 -50

evals/test_memory_integration.py

··· 2 2 3 3 import pytest 4 4 5 - from bot.agent import PhiAgent 6 5 from bot.config import Settings 7 6 from bot.memory import MemoryType, NamespaceMemory 8 7 9 8 10 - @pytest.mark.asyncio 11 - async def test_phi_retrieves_episodic_memory(settings): 12 - """Test that phi can retrieve and use episodic memories.""" 9 + @pytest.fixture 10 + def memory_settings(): 11 + """Check if memory keys are available.""" 12 + settings = Settings() 13 13 if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]): 14 - pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env") 14 + pytest.skip("Requires TURBOPUFFER_API_KEY, OPENAI_API_KEY, and ANTHROPIC_API_KEY") 15 + return settings 15 16 16 - # Create memory system 17 - memory = NamespaceMemory(api_key=settings.turbopuffer_api_key) 18 17 19 - # Store a memory about a user 20 - await memory.store_user_memory( 21 - "alice.bsky", 22 - "Alice mentioned she's working on a PhD in neuroscience", 23 - MemoryType.USER_FACT, 18 + async def test_core_memory_integration(memory_settings, phi_agent, evaluate_response): 19 + """Test that phi uses core memories in responses.""" 20 + memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key) 21 + 22 + # Store a core memory 23 + await memory.store_core_memory( 24 + label="test_interaction_rule", 25 + content="When users mention birds, always acknowledge the beauty of murmuration patterns", 26 + memory_type=MemoryType.GUIDELINE, 24 27 ) 25 28 26 - # Create agent 27 - agent = PhiAgent() 28 - agent.memory = memory 29 + # Override agent's memory with our test memory 30 + phi_agent.memory = memory 29 31 30 - # Process a mention that should trigger memory retrieval 31 - response = await agent.process_mention( 32 - mention_text="what do you remember about me?", 33 - author_handle="alice.bsky", 32 + # Ask about birds 33 + response = await phi_agent.process_mention( 34 + mention_text="I saw a huge flock of starlings today", 35 + 
author_handle="test.user", 34 36 thread_context="No previous messages in this thread.", 35 - thread_uri="at://test/thread/memory1", 37 + thread_uri="at://test/thread/1", 36 38 ) 37 39 38 40 if response.action == "reply": 39 - assert response.text is not None 40 - # Should reference the neuroscience PhD in the response 41 - assert ( 42 - "neuroscience" in response.text.lower() 43 - or "phd" in response.text.lower() 44 - or "working on" in response.text.lower() 45 - ), "Response should reference stored memory about Alice" 41 + await evaluate_response( 42 + "Does the response acknowledge or reference murmuration patterns?", 43 + response.text, 44 + ) 46 45 47 46 48 - @pytest.mark.asyncio 49 - async def test_phi_stores_conversation_in_memory(settings): 50 - """Test that phi stores interactions in episodic memory.""" 51 - if not all([settings.turbopuffer_api_key, settings.openai_api_key, settings.anthropic_api_key]): 52 - pytest.skip("Requires TurboPuffer, OpenAI, and Anthropic API keys in .env") 47 + async def test_user_memory_integration(memory_settings, phi_agent, evaluate_response): 48 + """Test that phi uses user-specific memories in responses.""" 49 + memory = NamespaceMemory(api_key=memory_settings.turbopuffer_api_key) 53 50 54 - memory = NamespaceMemory(api_key=settings.turbopuffer_api_key) 51 + # Store a memory about a user 52 + await memory.store_user_memory( 53 + handle="alice.test", 54 + content="Alice is researching swarm intelligence in biological systems", 55 + memory_type=MemoryType.USER_FACT, 56 + ) 55 57 56 - agent = PhiAgent() 57 - agent.memory = memory 58 + # Override agent's memory 59 + phi_agent.memory = memory 58 60 59 - # Have a conversation 60 - response = await agent.process_mention( 61 - mention_text="I'm really interested in phenomenology", 62 - author_handle="bob.bsky", 61 + # User asks a question 62 + response = await phi_agent.process_mention( 63 + mention_text="what do you remember about my research?", 64 + author_handle="alice.test", 63 
65 thread_context="No previous messages in this thread.", 64 - thread_uri="at://test/thread/memory2", 66 + thread_uri="at://test/thread/2", 65 67 ) 66 68 67 69 if response.action == "reply": 68 - # Verify memories were stored 69 - memories = await memory.get_user_memories("bob.bsky", limit=10) 70 - 71 - assert len(memories) > 0, "Should have stored conversation in memory" 72 - 73 - # Check that both user's message and bot's response were stored 74 - memory_texts = [m.content for m in memories] 75 - assert any( 76 - "phenomenology" in text.lower() for text in memory_texts 77 - ), "Should store user's message about phenomenology" 70 + await evaluate_response( 71 + "Does the response reference Alice's research on swarm intelligence or biological systems?", 72 + response.text, 73 + )