Testing Documentation

Comprehensive guide to testing EiplGrader components and features.

Overview

EiplGrader uses a multi-layered testing approach:

  • Unit Tests: Test individual components in isolation
  • Integration Tests: Test component interactions
  • Language Tests: Test language-specific features
  • End-to-End Tests: Test complete workflows
  • Performance Tests: Test execution speed and resource usage

Test Structure

tests/
├── unit/                    # Unit tests
│   ├── test_adapters/      # Language adapter tests
│   ├── test_executors/     # Language executor tests
│   ├── test_codegen.py     # CodeGenerator tests
│   └── test_tester.py      # CodeTester tests
├── integration/            # Integration tests
│   ├── test_generation/    # Code generation tests
│   ├── test_execution/     # Code execution tests
│   └── test_languages/     # Language-specific tests
├── edge_cases/            # Edge case testing
│   ├── test_error_scenarios/
│   └── test_limits/
├── performance/           # Performance benchmarks
└── conftest.py           # Pytest configuration

Running Tests

Basic Commands

# Run all tests
python -m pytest

# Run with coverage
python -m pytest --cov=eiplgrader tests/

# Run specific test file
python -m pytest tests/unit/test_codegen.py

# Run specific test
python -m pytest tests/unit/test_codegen.py::test_generate_code

# Run tests for specific language
python -m pytest -k "python"

# Run with verbose output
python -m pytest -v

# Run tests in parallel (requires the pytest-xdist plugin)
python -m pytest -n auto

Test Categories

# Unit tests only
python -m pytest tests/unit/

# Integration tests only  
python -m pytest tests/integration/

# Edge cases
python -m pytest tests/edge_cases/

# Performance tests (requires the pytest-benchmark plugin)
python -m pytest tests/performance/ --benchmark-only

Writing Tests

Unit Test Template

import pytest
from unittest.mock import Mock, patch
from eiplgrader.codegen import CodeGenerator

class TestCodeGenerator:
    """Unit tests for CodeGenerator."""
    
    def setup_method(self):
        """Set up test fixtures."""
        self.api_key = "test-key"
        self.generator = CodeGenerator(self.api_key, client_type="openai")
    
    def test_initialization(self):
        """Test proper initialization."""
        assert self.generator.api_key == self.api_key
        assert self.generator.language == "python"
        assert self.generator.client_type == "openai"
    
    @patch('eiplgrader.codegen.OpenAIRequest')
    def test_generate_code(self, mock_openai):
        """Test code generation."""
        # Mock LLM response
        mock_openai.return_value.request_function_generation.return_value = {
            "choices": [{
                "message": {
                    "content": "def factorial(n):\n    return 1"
                }
            }]
        }
        
        # Generate code
        result = self.generator.generate_code(
            student_response="calculate factorial",
            function_name="factorial"
        )
        
        # Assertions
        assert len(result["code"]) == 1
        assert "factorial" in result["code"][0]
        mock_openai.return_value.request_function_generation.assert_called_once()
    
    def test_invalid_client_type(self):
        """Test error handling for invalid client type."""
        with pytest.raises(ValueError):
            CodeGenerator(self.api_key, client_type="invalid_client")
    
    @pytest.mark.parametrize("gen_type,expected", [
        ("cgbg", True),
        ("redef", True),
        ("invalid", False)
    ])
    def test_generation_types(self, gen_type, expected):
        """Test different generation types."""
        if expected:
            # Should not raise
            self.generator._validate_gen_type(gen_type)
        else:
            with pytest.raises(ValueError):
                self.generator._validate_gen_type(gen_type)

Integration Test Template

import os

import pytest

from eiplgrader.codegen import CodeGenerator
from eiplgrader.tester import CodeTester  # adjust the import path to match your package layout


class TestCodeGenerationIntegration:
    """Integration tests for code generation and testing."""
    
    @pytest.fixture
    def generator(self):
        """Create generator fixture."""
        return CodeGenerator(os.getenv("TEST_API_KEY", "dummy"))
    
    @pytest.fixture
    def test_cases(self):
        """Create test case fixture."""
        return [
            {"parameters": {"n": 5}, "expected": 120},
            {"parameters": {"n": 0}, "expected": 1}
        ]
    
    def test_generate_and_test_python(self, generator, test_cases):
        """Test Python code generation and execution."""
        # Generate code
        result = generator.generate_code(
            student_response="calculate factorial recursively",
            function_name="factorial",
            language="python"
        )
        
        assert result.success
        assert len(result.codes) > 0
        
        # Test generated code
        for code in result.codes:
            tester = CodeTester(
                code=code,
                test_cases=test_cases,
                function_name="factorial",
                language="python"
            )
            
            results = tester.run_tests()
            assert results.was_successful(), f"Tests failed for code:\n{code}"

Language-Specific Tests

class TestJavaLanguageSupport:
    """Test Java language support."""
    
    def test_java_type_requirements(self):
        """Test that Java requires explicit types."""
        executor = JavaExecutor()
        
        # Missing types should raise error
        test_case = {
            "parameters": {"x": 5},
            "expected": 10
        }
        
        with pytest.raises(ValueError, match="parameter_types"):
            executor.validate_types_provided(test_case)
    
    def test_java_value_formatting(self):
        """Test Java value formatting."""
        executor = JavaExecutor()
        
        # Test various type conversions
        assert executor.format_value(True, "boolean") == "true"
        assert executor.format_value("hello", "String") == '"hello"'
        assert executor.format_value([1, 2, 3], "int[]") == "new int[]{1, 2, 3}"
    
    def test_java_compilation(self):
        """Test Java code compilation."""
        code = """
        public class Solution {
            public int add(int a, int b) {
                return a + b;
            }
        }
        """
        
        test_case = {
            "parameters": {"a": 5, "b": 3},
            "parameter_types": {"a": "int", "b": "int"},
            "expected": 8,
            "expected_type": "int"
        }
        
        executor = JavaExecutor()
        result = executor.execute_test(code, test_case)
        
        assert result["passed"]
        assert result["actual"] == 8

Edge Case Tests

class TestEdgeCases:
    """Test edge cases and error conditions."""
    
    def test_empty_code(self):
        """Test handling of empty code."""
        tester = CodeTester(
            code="",
            test_cases=[{"parameters": {"x": 1}, "expected": 1}],
            function_name="test"
        )
        
        results = tester.run_tests()
        assert not results.was_successful()
        assert any("runtime" in str(r.get("error", "")).lower() 
                  for r in results.test_results if not r["pass"])
    
    def test_infinite_loop(self):
        """Test timeout handling."""
        code = """
def infinite():
    while True:
        pass
"""
        
        tester = CodeTester(
            code=code,
            test_cases=[{"parameters": {}, "expected": None}],
            function_name="infinite",
            timeout=1
        )
        
        results = tester.run_tests()
        assert not results.was_successful()
        assert any("timeout" in str(r.get("error", "")).lower() 
                  for r in results.test_results if not r["pass"])
    
    def test_large_output(self):
        """Test output size limits."""
        code = """
def large_output():
    return "x" * 10000000  # 10MB string
"""
        
        tester = CodeTester(
            code=code,
            test_cases=[{"parameters": {}, "expected": "x"}],
            function_name="large_output",

        )
        
        results = tester.run_tests()
        assert not results.was_successful()
        assert any("output size" in str(r.get("error", "")).lower() 
                  for r in results.test_results if not r["pass"])

Test Fixtures

Common Fixtures

# conftest.py
import os
import shutil
import tempfile

import pytest

@pytest.fixture
def temp_dir():
    """Create temporary directory for tests."""
    temp = tempfile.mkdtemp()
    yield temp
    shutil.rmtree(temp, ignore_errors=True)

@pytest.fixture
def sample_code():
    """Provide sample code for testing."""
    return {
        "python": """
def factorial(n):
    if n <= 1:
        return 1
    return n * factorial(n - 1)
""",
        "java": """
public class Solution {
    public int factorial(int n) {
        if (n <= 1) return 1;
        return n * factorial(n - 1);
    }
}
""",
        "javascript": """
function factorial(n) {
    if (n <= 1) return 1;
    return n * factorial(n - 1);
}
"""
    }

@pytest.fixture
def api_key():
    """Get API key for testing."""
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        pytest.skip("API key not available")
    return key
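
The fixtures above compose directly in tests. A minimal sketch of their use (the test itself is illustrative and not part of the suite):

import os

def test_sample_code_written_to_temp_dir(sample_code, temp_dir):
    """Write the Python sample into the scratch directory."""
    path = os.path.join(temp_dir, "solution.py")
    with open(path, "w") as f:
        f.write(sample_code["python"])

    # temp_dir (and this file) is removed automatically after the test
    assert os.path.exists(path)
    assert "factorial" in sample_code["python"]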

Language-Specific Fixtures

@pytest.fixture(params=["python", "javascript"])
def dynamic_language(request):
    """Parametrized fixture for dynamic languages."""
    return request.param

@pytest.fixture(params=["java", "cpp", "go"])
def static_language(request):
    """Parametrized fixture for static languages."""
    return request.param

@pytest.fixture
def language_test_cases():
    """Test cases for each language."""
    return {
        "python": {
            "simple": {
                "parameters": {"x": 5},
                "expected": 10
            }
        },
        "java": {
            "simple": {
                "parameters": {"x": 5},
                "parameter_types": {"x": "int"},
                "expected": 10,
                "expected_type": "int"
            }
        }
    }
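
Because these fixtures are parametrized, a single test function runs once per language and can pull the matching cases from language_test_cases. A hedged sketch (languages without an entry are simply skipped):

import pytest

def test_simple_case_shape(static_language, language_test_cases):
    """Runs once per static language parameter."""
    cases = language_test_cases.get(static_language)
    if cases is None:
        pytest.skip(f"No test cases defined for {static_language}")

    case = cases["simple"]
    # Static languages must carry explicit type annotations
    assert "parameter_types" in case
    assert "expected_type" in case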

Test Utilities

Test Case Builders

from typing import Any


def create_test_case(language: str, params: dict, expected: Any) -> dict:
    """Create test case with appropriate format for language."""
    test_case = {
        "parameters": params,
        "expected": expected
    }
    
    # Add types for static languages
    if language in ["java", "cpp", "c", "go", "haskell"]:
        test_case["parameter_types"] = infer_types(params, language)
        test_case["expected_type"] = infer_type(expected, language)
    
    return test_case

def infer_types(params: dict, language: str) -> dict:
    """Infer parameter types for a language."""
    type_map = get_type_mapping(language)
    return {
        name: type_map[type(value).__name__]
        for name, value in params.items()
    }
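
create_test_case and infer_types rely on helpers that are not shown here. A minimal sketch of what they might look like; the type tables below are illustrative assumptions, not the project's actual mappings:

def get_type_mapping(language: str) -> dict:
    """Map Python type names to target-language type names (illustrative)."""
    mappings = {
        "java": {"int": "int", "float": "double", "str": "String", "bool": "boolean"},
        "cpp": {"int": "int", "float": "double", "str": "std::string", "bool": "bool"},
        "go": {"int": "int", "float": "float64", "str": "string", "bool": "bool"},
    }
    return mappings[language]

def infer_type(value, language: str) -> str:
    """Infer the target-language type of a single expected value (illustrative)."""
    return get_type_mapping(language)[type(value).__name__]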

Mock Helpers

from typing import List


class MockLLMResponse:
    """Mock LLM response for testing."""
    
    def __init__(self, codes: List[str]):
        self.codes = codes
        self.call_count = 0
    
    def __call__(self, *args, **kwargs):
        """Return next code in sequence."""
        if self.call_count < len(self.codes):
            code = self.codes[self.call_count]
            self.call_count += 1
            return {"choices": [{"message": {"content": code}}]}
        raise ValueError("No more mock responses")

def mock_llm_response(monkeypatch, codes: List[str]):
    """Patch LLM calls with mock responses."""
    mock = MockLLMResponse(codes)
    monkeypatch.setattr(
        "eiplgrader.codegen.OpenAIRequest.request_function_generation",
        mock
    )
    return mock
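
A sketch of how the helper slots into a test; it assumes the patched path matches the installed version and avoids asserting on the exact result structure:

from eiplgrader.codegen import CodeGenerator

def test_generation_with_mocked_llm(monkeypatch):
    """Exercise generate_code without calling a real LLM."""
    mock = mock_llm_response(monkeypatch, ["def add(a, b):\n    return a + b"])

    generator = CodeGenerator("dummy-key", client_type="openai")
    generator.generate_code(
        student_response="add two numbers",
        function_name="add"
    )

    # The patched request was consumed exactly once
    assert mock.call_count == 1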

Performance Testing

Benchmark Tests

import pytest  # the benchmark fixture below comes from the pytest-benchmark plugin

@pytest.mark.benchmark
def test_code_generation_performance(benchmark):
    """Benchmark code generation."""
    generator = CodeGenerator("key")
    
    result = benchmark(
        generator.generate_code,
        student_response="calculate factorial",
        function_name="factorial",
        num_to_gen=1
    )
    
    assert result.success

@pytest.mark.benchmark
def test_execution_performance(benchmark):
    """Benchmark code execution."""
    code = "def add(a, b): return a + b"
    test_case = {"parameters": {"a": 1, "b": 2}, "expected": 3}
    
    tester = CodeTester(
        code=code,
        test_cases=[test_case],
        function_name="add"
    )
    
    # Per-test benchmarking is not exposed, so benchmark the full run_tests() call
    results = benchmark(tester.run_tests)
    assert results.was_successful()

Load Testing

def test_concurrent_execution_load():
    """Test system under load."""
    import time
    from math import factorial  # expected values for the generated test cases
    
    # Create many test cases
    test_cases = [
        {"parameters": {"n": i}, "expected": factorial(i)}
        for i in range(100)
    ]
    
    tester = CodeTester(
        code="def factorial(n): ...",  # Implementation
        test_cases=test_cases,
        function_name="factorial"
    )
    
    # Run with different worker counts (pass `workers` through to the tester's
    # execution configuration if your setup supports parallel execution)
    for workers in [1, 4, 8, 16]:
        start = time.time()
        results = tester.run_tests()
        duration = time.time() - start
        
        print(f"Workers: {workers}, Time: {duration:.2f}s")
        assert results.was_successful()

Test Coverage

Coverage Configuration

# .coveragerc
[run]
source = eiplgrader
omit = 
    */tests/*
    */test_*
    */__pycache__/*
    */venv/*

[report]
exclude_lines =
    pragma: no cover
    def __repr__
    raise AssertionError
    raise NotImplementedError
    if __name__ == .__main__.:
    @abstractmethod

Coverage Commands

# Run with coverage
python -m pytest --cov=eiplgrader --cov-report=html

# View coverage report
open htmlcov/index.html

# Check coverage threshold
python -m pytest --cov=eiplgrader --cov-fail-under=80
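
The same threshold can be pinned in the pytest configuration so every local and CI run enforces it. A sketch (assumes pytest-cov is installed):

# pytest.ini
[pytest]
addopts = --cov=eiplgrader --cov-fail-under=80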

Continuous Integration

GitHub Actions Example

# .github/workflows/tests.yml
name: Tests

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.8, 3.9, "3.10", "3.11"]
        
    steps:
    - uses: actions/checkout@v3
    
    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    
    - name: Install dependencies
      run: |
        pip install -e ".[dev]"
        
    - name: Run linting
      run: |
        black --check eiplgrader tests
        pylint eiplgrader
        mypy eiplgrader
    
    - name: Run tests
      run: |
        python -m pytest --cov=eiplgrader --cov-report=xml
    
    - name: Upload coverage
      uses: codecov/codecov-action@v3

Test Best Practices

1. Test Organization

  • One test class per component
  • Group related tests together
  • Use descriptive test names
  • Keep tests focused and simple (see the sketch below)
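
A minimal sketch of this layout; the class and test names are illustrative:

class TestPythonExecutor:
    """All unit tests for one component live in one focused class."""

    def test_executes_simple_function(self):
        ...

    def test_reports_runtime_error(self):
        ...

    def test_enforces_timeout(self):
        ...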

2. Test Independence

  • Each test should be independent
  • Use fixtures for setup/teardown
  • Don’t rely on test execution order
  • Clean up resources after tests (see the fixture sketch below)
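
A sketch of fixture-based setup and teardown that keeps each test self-contained; the scratch-file fixture is invented for illustration:

import pytest

@pytest.fixture
def scratch_file(tmp_path):
    """Each test gets its own file; pytest discards tmp_path afterwards."""
    path = tmp_path / "scratch.txt"
    path.write_text("initial")
    yield path
    # Explicit teardown runs even if the test fails
    if path.exists():
        path.unlink()

def test_reads_scratch_file(scratch_file):
    assert scratch_file.read_text() == "initial"

def test_writes_scratch_file(scratch_file):
    scratch_file.write_text("updated")
    assert scratch_file.read_text() == "updated"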

3. Mock External Dependencies

@patch('requests.post')
def test_api_call(mock_post):
    mock_post.return_value.json.return_value = {"result": "success"}
    # Test code that uses requests.post

4. Parametrized Tests

@pytest.mark.parametrize("input,expected", [
    (0, 1),
    (1, 1),
    (5, 120),
    (10, 3628800)
])
def test_factorial_values(input, expected):
    assert factorial(input) == expected

5. Error Testing

def test_error_handling():
    with pytest.raises(ValueError, match="negative"):
        factorial(-1)

Debugging Tests

Pytest Options

# Show print statements
python -m pytest -s

# Drop into debugger on failure
python -m pytest --pdb

# Show local variables on failure
python -m pytest -l

# Run last failed tests
python -m pytest --lf

# Run specific test by name pattern
python -m pytest -k "test_generate"

Debugging Techniques

def test_with_debugging():
    """Test with debugging helpers."""
    # Drop into the debugger at this point
    import pdb; pdb.set_trace()
    
    # Print debug info (visible when running pytest -s)
    print(f"Debug: {variable}")

def test_output(capsys):
    """Use pytest's capsys fixture to inspect captured output."""
    print("Hello")
    captured = capsys.readouterr()
    assert captured.out == "Hello\n"

Next Steps

  • Review Contributing guide for development workflow
  • Check existing tests in the repository
  • Run the test suite locally
  • Add tests for new features