Advanced Features
Explore sophisticated capabilities of EiplGrader for complex grading scenarios.
Multiple Function Variants
Generate multiple implementations of the same function to find the best one.
Generating Variants
from eiplgrader.codegen import CodeGenerator
generator = CodeGenerator(api_key, client_type="openai", language="python")
# Generate 5 different implementations
result = generator.generate_code(
    student_response="that implements binary search on a sorted list",
    function_name="binary_search",
    num_to_gen=5  # Generate 5 variants
)
# Access all variants
for i, code in enumerate(result["code"]):
    print(f"Variant {i + 1}:")
    print(code)
    print("-" * 40)
Testing Multiple Variants
from eiplgrader.tester import CodeTester
# Test cases for binary search
test_cases = [
    {
        "parameters": {"arr": [1, 3, 5, 7, 9], "target": 5},
        "expected": 2  # Index of 5
    },
    {
        "parameters": {"arr": [1, 3, 5, 7, 9], "target": 6},
        "expected": -1  # Not found
    }
]
# Test each variant
best_variant = None
best_score = 0

for i, code in enumerate(result["code"]):
    try:
        tester = CodeTester(
            code=code,
            test_cases=test_cases,
            function_name="binary_search",
            language="python"
        )
        results = tester.run_tests()
        score = results.successes / results.testsRun
        print(f"Variant {i + 1}: {score * 100:.0f}% tests passed")
        if score > best_score:
            best_score = score
            best_variant = i
    except Exception as e:
        print(f"Variant {i + 1} failed: {e}")

if best_variant is not None:
    print(f"\nBest variant: #{best_variant + 1} with {best_score * 100:.0f}% success")
Code Segmentation
Map natural language explanations to specific code segments for detailed feedback.
Setting Up Segmentation
# Create few-shot examples for segmentation
segmentation_examples = [
    {
        "nl_explanation": "First, I check if the list is empty. Then I iterate through the list to find the maximum.",
        "code": """def find_max(lst):
    if not lst:
        return None
    max_val = lst[0]
    for val in lst:
        if val > max_val:
            max_val = val
    return max_val""",
        "segmentation": [
            {"segment": "I check if the list is empty", "lines": [2]},
            {"segment": "I iterate through the list to find the maximum", "lines": [4, 5, 6, 7]}
        ]
    }
]
# Save to JSON file
import json
with open("segmentation_few_shot.json", "w") as f:
json.dump(segmentation_examples, f, indent=2)
Using Segmentation
# Generate code with segmentation
result = generator.generate_code(
    student_response="""First, I initialize an empty result list.
Then I loop through the input list.
For each element, if it's even, I add it to the result.
Finally, I return the result list.""",
    function_name="filter_even",
    segmentation_few_shot_file="segmentation_few_shot.json"
)
# Access segmentation results
if "segmentation" in result:
print("Code:")
print(result["code"][0])
print("\nSegmentation mapping:")
for segment in result["segmentation"]:
print(f"'{segment['segment']}' -> lines {segment['lines']}")
Providing Feedback with Segmentation
def provide_detailed_feedback(code, test_results, segmentation):
    """Provide line-specific feedback based on test failures."""
    feedback = []
    # Analyze which segments might be problematic
    if not test_results.was_successful():
        # Map errors to code segments
        for result in test_results.test_results:
            if not result["pass"]:
                if "empty" in str(result["function_call"]).lower():
                    # Find segment related to empty list handling
                    for seg in segmentation:
                        if "empty" in seg["segment"].lower():
                            feedback.append({
                                "issue": "Empty list handling failed",
                                "segment": seg["segment"],
                                "lines": seg["lines"]
                            })
    return feedback
# Example usage
feedback = provide_detailed_feedback(
    result["code"][0],
    test_results,  # From an earlier CodeTester.run_tests() call
    result["segmentation"]
)

for item in feedback:
    print(f"Issue: {item['issue']}")
    print(f"Related explanation: '{item['segment']}'")
    print(f"Check lines: {item['lines']}")
In-Place Operations
Test functions that modify their arguments rather than returning new values.
Mode 0: Normal Return (Default)
# Standard function that returns a value
test_case = {
    "parameters": {"numbers": [3, 1, 4, 1, 5]},
    "expected": [1, 1, 3, 4, 5],
    "inplace": "0"  # Default - expects return value
}
# Generated function example:
# def sort_list(numbers):
#     return sorted(numbers)
Mode 1: In-Place Modification
# Function modifies the input in place
test_case = {
    "parameters": {"numbers": [3, 1, 4, 1, 5]},
    "expected": [1, 1, 3, 4, 5],
    "inplace": "1"  # Tests that 'numbers' is modified
}
# Generated function example:
# def sort_list(numbers):
#     numbers.sort()  # Modifies in place, no return
Mode 2: Modify and Return
# Function both modifies and returns
test_case = {
    "parameters": {"numbers": [3, 1, 4, 1, 5]},
    "expected": [1, 1, 3, 4, 5],
    "inplace": "2"  # Tests both modification and return
}
# Generated function example:
# def sort_list(numbers):
#     numbers.sort()   # Modifies in place
#     return numbers   # Also returns the modified list
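Conceptually, the three modes differ only in what the harness compares after the call. The following is a rough plain-Python illustration of that difference, not EiplGrader's actual test harness:
import copy

def check_inplace(func, numbers, expected, mode):
    """Rough sketch of the three comparison modes. Not EiplGrader's harness."""
    arg = copy.deepcopy(numbers)  # Keep the original test data intact
    returned = func(arg)
    if mode == "0":
        return returned == expected  # Compare the return value only
    if mode == "1":
        return arg == expected       # Compare the mutated argument only
    return returned == expected and arg == expected  # Mode "2": both must match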
Language-Specific Examples
# Python - List modification
test_cases_python = [
    {
        "parameters": {"lst": [1, 2, 3], "value": 4},
        "expected": [1, 2, 3, 4],
        "inplace": "1"  # append modifies list
    }
]
# Java - Array modification
test_cases_java = [
    {
        "parameters": {"arr": [3, 1, 4], "n": 3},
        "parameter_types": {"arr": "int[]", "n": "int"},
        "expected": [1, 3, 4],
        "expected_type": "int[]",
        "inplace": "1"  # Sort array in place
    }
]
# C++ - Vector modification by reference
test_cases_cpp = [
    {
        "parameters": {"vec": [5, 2, 8, 1]},
        "parameter_types": {"vec": "std::vector<int>"},
        "expected": [1, 2, 5, 8],
        "expected_type": "std::vector<int>",
        "inplace": "1"
    }
]
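The same CodeTester interface runs these cases; only the source code and language identifier change. A sketch, where java_code and the function name sortArray are hypothetical placeholders and "java" is assumed to be the identifier CodeTester accepts (see Language Support):
java_tester = CodeTester(
    code=java_code,  # Hypothetical generated Java source
    test_cases=test_cases_java,
    function_name="sortArray",  # Hypothetical function name
    language="java"  # Assumed language identifier
)
java_results = java_tester.run_tests()
print(f"Java: {java_results.successes}/{java_results.testsRun} passed")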
Custom Timeouts
Configure execution timeouts for long-running or complex functions.
# Set custom timeout per test case
test_cases = [
    {
        "parameters": {"n": 1000000},
        "expected": "result",
        "timeout": 60  # 60 seconds for this test
    },
    {
        "parameters": {"n": 10},
        "expected": "quick_result",
        "timeout": 5  # 5 seconds for this test
    }
]
# Per-test-case timeouts are applied when the tester runs
tester = CodeTester(
    code=code,
    test_cases=test_cases,
    function_name="process_data",
    language="python"
)
results = tester.run_tests()
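Timeouts can also be derived from input size rather than hard-coded. A small sketch; with_scaled_timeout and its scaling rule are illustrative, not part of EiplGrader:
def with_scaled_timeout(case, floor=5, per_100k=1):
    """Attach a timeout that grows with input size. Illustrative heuristic only."""
    n = case["parameters"].get("n", 0)
    case["timeout"] = floor + (n // 100_000) * per_100k
    return case

test_cases = [with_scaled_timeout(tc) for tc in test_cases]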
Temperature Control
Adjust the creativity/randomness of generated code.
# Low temperature (0.2) - More deterministic, conventional solutions
conservative_result = generator.generate_code(
    student_response="that sorts a list",
    temperature=0.2
)

# High temperature (1.5) - More creative, varied solutions
creative_result = generator.generate_code(
    student_response="that sorts a list",
    temperature=1.5
)
# Compare different temperature outputs
for temp in [0.0, 0.5, 1.0, 1.5]:
    result = generator.generate_code(
        student_response="that calculates fibonacci numbers",
        temperature=temp
    )
    print(f"Temperature {temp}:")
    print(result["code"][0][:100] + "...")  # First 100 chars
Model Selection
Choose different LLM models for generation.
# Use different models
models = ["gpt-4o", "gpt-4o-turbo", "gpt-3.5-turbo"]
for model in models:
result = generator.generate_code(
student_response="that implements quicksort",
model=model
)
print(f"Model {model} generated:")
print(result["code"][0])
Batch Processing with Parallel Execution
Process multiple students or tasks efficiently.
import concurrent.futures
from typing import List, Dict
def process_student_response(
    generator: CodeGenerator,
    response: str,
    test_cases: List[Dict],
    function_name: str,
    language: str = "python"
) -> Dict:
    """Process a single student response."""
    try:
        # Generate code
        gen_result = generator.generate_code(
            student_response=response,
            function_name=function_name
        )
        # Test code
        tester = CodeTester(
            code=gen_result["code"][0],
            test_cases=test_cases,
            function_name=function_name,
            language=language
        )
        test_result = tester.run_tests()
        return {
            "response": response,
            "success": True,
            "score": test_result.successes / test_result.testsRun,
            "details": test_result
        }
    except Exception as e:
        return {
            "response": response,
            "success": False,
            "error": str(e)
        }
# Process multiple responses concurrently
student_responses = [
    "that calculates the mean of a list",
    "that finds the median of a list",
    "that computes the mode of a list",
    "that calculates standard deviation"
]
# Map a keyword in the response to (function_name, test_cases); names are illustrative
task_map = {
    "mean": ("calculate_mean", [{"parameters": {"lst": [1, 2, 3, 4, 5]}, "expected": 3.0}]),
    "median": ("calculate_median", [{"parameters": {"lst": [1, 2, 3, 4, 5]}, "expected": 3}]),
    "mode": ("calculate_mode", [{"parameters": {"lst": [1, 2, 2, 3]}, "expected": 2}]),
    "standard deviation": ("calculate_std", [{"parameters": {"lst": [2, 4, 6]}, "expected": 2.0}])
}

# Parallel processing
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []
    for response in student_responses:
        # Determine which task this response belongs to
        for key, (function_name, test_cases) in task_map.items():
            if key in response.lower():
                break
        else:
            continue  # No matching task; skip this response
        futures.append(executor.submit(
            process_student_response,
            generator,
            response,
            test_cases,
            function_name
        ))

# Collect results (leaving the with-block waits for all futures)
results = [future.result() for future in futures]
# Display results
for result in results:
    if result["success"]:
        print(f"✓ {result['response']}: {result['score'] * 100:.1f}%")
    else:
        print(f"✗ {result['response']}: {result['error']}")
Custom Validation
Add custom validation logic beyond standard test cases.
from typing import List

def validate_code_style(code: str, language: str) -> List[str]:
    """Check code style and return issues."""
    issues = []
    if language == "python":
        # Check for PEP 8 compliance
        lines = code.split('\n')
        for i, line in enumerate(lines):
            if len(line) > 79:
                issues.append(f"Line {i + 1} exceeds 79 characters")
            if '\t' in line:
                issues.append(f"Line {i + 1} uses tabs instead of spaces")
    elif language == "java":
        # Check for Java conventions
        if "class Solution" not in code:
            issues.append("Missing Solution class wrapper")
        if not any(line.strip().startswith("import") for line in code.split('\n')):
            issues.append("Consider adding necessary imports")
    return issues
# Use custom validation
code = result["code"][0]
style_issues = validate_code_style(code, "python")
if style_issues:
print("Style issues found:")
for issue in style_issues:
print(f" - {issue}")
Structured Grading Rubrics
Create comprehensive grading rubrics combining multiple criteria.
from typing import Dict, List

class GradingRubric:
    def __init__(self):
        self.criteria = []

    def add_criterion(self, name: str, weight: float, test_cases: List[Dict]):
        self.criteria.append({
            "name": name,
            "weight": weight,
            "test_cases": test_cases
        })

    def grade(self, code: str, function_name: str, language: str) -> Dict:
        total_score = 0
        results = {}
        for criterion in self.criteria:
            tester = CodeTester(
                code=code,
                test_cases=criterion["test_cases"],
                function_name=function_name,
                language=language
            )
            test_result = tester.run_tests()
            score = (test_result.successes / test_result.testsRun) * criterion["weight"]
            results[criterion["name"]] = {
                "score": score,
                "max_score": criterion["weight"],
                "details": test_result
            }
            total_score += score
        return {
            "total_score": total_score,
            "criteria_results": results
        }
# Example rubric for a sorting function
rubric = GradingRubric()
# Basic functionality (40%)
rubric.add_criterion(
    "basic_functionality",
    40,
    [
        {"parameters": {"lst": [3, 1, 4]}, "expected": [1, 3, 4]},
        {"parameters": {"lst": [1]}, "expected": [1]}
    ]
)

# Edge cases (30%)
rubric.add_criterion(
    "edge_cases",
    30,
    [
        {"parameters": {"lst": []}, "expected": []},
        {"parameters": {"lst": [1, 1, 1]}, "expected": [1, 1, 1]}
    ]
)

# Performance (30%)
rubric.add_criterion(
    "performance",
    30,
    [
        {
            "parameters": {"lst": list(range(1000, 0, -1))},
            "expected": list(range(1, 1001)),
            "timeout": 1
        }
    ]
)
# Grade the submission
grade_report = rubric.grade(code, "sort_list", "python")
print(f"Total Score: {grade_report['total_score']}/100")
Next Steps
- Review Test Case Format for complex test scenarios
- Explore Language Support for language-specific advanced features
- See Developer Documentation for extending EiplGrader