cmd/eval-dev-quality/cmd/evaluate.go (94 changes: 61 additions & 33 deletions)

@@ -61,7 +61,7 @@ type Evaluate struct {
 	// ProviderUrls holds all custom inference endpoint urls for the providers.
 	ProviderUrls map[string]string `long:"urls" description:"Custom OpenAI API compatible inference endpoints (of the form '$provider:$url,...'). Use '$provider=custom-$name' to manually register a custom OpenAI API endpoint provider. Note that the models of a custom OpenAI API endpoint provider must be declared explicitly using the '--model' option. When using the environment variable, separate multiple definitions with ','." env:"PROVIDER_URL" env-delim:","`
 	// APIRequestAttempts holds the number of allowed API requests per LLM query.
-	APIRequestAttempts uint `long:"api-request-attempts" description:"Number of allowed API requests per LLM query." default:"3"`
+	APIRequestAttempts uint `long:"api-request-attempts" description:"Number of allowed API requests per LLM query." default:"10"`
 	// APIRequestTimeout holds the timeout for API requests in seconds.
 	APIRequestTimeout uint `long:"api-request-timeout" description:"Timeout of API requests in seconds. ('0' to disable)" default:"1200"`
 
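
The only change in this first hunk raises the default number of API attempts per LLM query from 3 to 10. For orientation, here is a generic retry loop of the kind such a setting typically drives; this is a sketch assuming a simple linear back-off, not the repository's actual request code:

    package main

    import (
    	"errors"
    	"fmt"
    	"time"
    )

    // queryWithRetries calls query up to "attempts" times and returns the first
    // successful response, or the last error once all attempts are used up.
    func queryWithRetries(attempts uint, query func() (string, error)) (string, error) {
    	var lastErr error
    	for i := uint(0); i < attempts; i++ {
    		response, err := query()
    		if err == nil {
    			return response, nil
    		}
    		lastErr = err
    		// Linear back-off between attempts, kept short for the example.
    		time.Sleep(time.Duration(i+1) * 100 * time.Millisecond)
    	}
    	return "", fmt.Errorf("all %d attempts failed: %w", attempts, lastErr)
    }

    func main() {
    	_, err := queryWithRetries(10, func() (string, error) {
    		return "", errors.New("always failing for demonstration")
    	})
    	fmt.Println(err)
    }
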
@@ -76,6 +76,8 @@ type Evaluate struct {
 	Configuration string `long:"configuration" description:"Configuration file to set up an evaluation run."`
 	// ExecutionTimeout holds the timeout for an execution.
 	ExecutionTimeout uint `long:"execution-timeout" description:"Execution timeout for compilation and tests in minutes." default:"5"`
+	// OnlyValidate indicates that only the configuration is validated and no evaluation is performed.
+	OnlyValidate bool `long:"only-validate" description:"Only validate the configuration and do not perform an evaluation."`
 	// RunIDStartsAt holds the offset increment for the run id used in creating the result folders.
 	RunIDStartsAt uint `long:"run-id-starts-at" description:"Sets the starting index for the run ID." default:"1"`
 	// Runs holds the number of runs to perform.
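
The struct-tag syntax above (`long`, `description`, `default`, `env`, `env-delim`) matches the github.com/jessevdk/go-flags library. Assuming that is indeed the flag parser in use (the import is not visible in this diff), the new option surfaces roughly like this:

    package main

    import (
    	"fmt"
    	"os"

    	flags "github.com/jessevdk/go-flags"
    )

    // options mirrors only the field relevant to this hunk; it is an
    // illustrative stand-in, not the repository's Evaluate type.
    type options struct {
    	OnlyValidate bool `long:"only-validate" description:"Only validate the configuration and do not perform an evaluation."`
    }

    func main() {
    	var opts options
    	// ParseArgs fills opts from the given argument list.
    	if _, err := flags.ParseArgs(&opts, []string{"--only-validate"}); err != nil {
    		fmt.Fprintln(os.Stderr, err)
    		os.Exit(1)
    	}
    	fmt.Println("validate only:", opts.OnlyValidate) // Prints "validate only: true".
    }
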
Expand Down Expand Up @@ -122,6 +124,22 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
evaluationContext = &evaluate.Context{}
evaluationConfiguration = NewEvaluationConfiguration()

// Setup evaluation result directory.
if !command.OnlyValidate {
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", command.timestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
uniqueResultPath, err := util.UniqueDirectory(command.ResultPath)
if err != nil {
command.logger.Panicf("ERROR: %s", err)
}
// Ensure that the directory really exists.
if err := osutil.MkdirAll(uniqueResultPath); err != nil {
command.logger.Panicf("ERROR: %s", err)
}
command.ResultPath = uniqueResultPath
evaluationContext.ResultPath = uniqueResultPath
command.logger.Info("configured results directory", "path", command.ResultPath)
}

// Load the provided configuration file, if any.
if command.Configuration != "" {
if command.Runtime != "local" {
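
`util.UniqueDirectory` itself is not part of this diff. As a rough sketch of the behavior the block above relies on, assuming it simply probes for a free directory name by appending a numeric suffix (the actual helper may differ):

    package main

    import (
    	"fmt"
    	"os"
    )

    // uniqueDirectory is a hypothetical stand-in for util.UniqueDirectory: it
    // returns the path itself if unused, otherwise path-1, path-2, ... until a
    // name is free.
    func uniqueDirectory(path string) (string, error) {
    	candidate := path
    	for i := 1; ; i++ {
    		_, err := os.Stat(candidate)
    		if os.IsNotExist(err) {
    			return candidate, nil // Free name found.
    		} else if err != nil {
    			return "", err // Unexpected filesystem error.
    		}
    		candidate = fmt.Sprintf("%s-%d", path, i) // Name is taken, try the next suffix.
    	}
    }

    func main() {
    	path, err := uniqueDirectory("evaluation-2024-01-02-15:04:05")
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println(path)
    }
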
Expand Down Expand Up @@ -215,29 +233,6 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
evaluationContext.NoDisqualification = command.NoDisqualification
}

// Setup evaluation result directory.
{
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", command.timestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
uniqueResultPath, err := util.UniqueDirectory(command.ResultPath)
if err != nil {
command.logger.Panicf("ERROR: %s", err)
}
// Ensure that the directory really exists.
if err := osutil.MkdirAll(uniqueResultPath); err != nil {
command.logger.Panicf("ERROR: %s", err)
}
command.ResultPath = uniqueResultPath
evaluationContext.ResultPath = uniqueResultPath
command.logger.Info("configured results directory", "path", command.ResultPath)
}

// Initialize logging within result directory.
{
log := command.logger.With(log.AttributeKeyResultPath, command.ResultPath)
command.logger = log
evaluationContext.Log = log
}

// Gather languages.
languagesSelected := map[string]language.Language{}
{
Expand Down Expand Up @@ -343,6 +338,10 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
}
evaluationContext.RepositoryPaths = command.Repositories
evaluationConfiguration.Repositories.Selected = append(evaluationConfiguration.Repositories.Selected, command.Repositories...)

for _, repositoryID := range evaluationConfiguration.Repositories.Selected {
command.logger.Info("selected repository", "repository", repositoryID)
}
}

// Make the resolved selected languages available in the command.
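
The `logger.Info` calls here pass alternating key/value pairs, structured-logging style. The project's logger type is not shown in this diff; the standard library's log/slog is used below purely as a stand-in with the same call shape (repository names are placeholders):

    package main

    import "log/slog"

    func main() {
    	// Placeholder repository identifiers, one log record each.
    	selected := []string{"golang/example", "java/example"}
    	for _, repositoryID := range selected {
    		slog.Info("selected repository", "repository", repositoryID)
    	}
    }
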
Expand Down Expand Up @@ -448,6 +447,7 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
sort.Strings(command.ModelIDsWithProviderAndAttributes)

// Check and initialize models.
var unknownModels []string
evaluationContext.ProviderForModel = map[model.Model]provider.Provider{}
for _, modelIDsWithProviderAndAttributes := range command.ModelIDsWithProviderAndAttributes {
command.logger.Info("selecting model", "model", modelIDsWithProviderAndAttributes)
Expand Down Expand Up @@ -502,7 +502,15 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
var ok bool
m, ok = models[modelIDWithProvider]
if !ok {
command.logger.Panicf("ERROR: model %q does not exist for provider %q. Valid models are: %s", modelIDsWithProviderAndAttributes, providerID, strings.Join(modelIDs, ", "))
unknownModels = append(unknownModels, modelIDsWithProviderAndAttributes)
command.logger.Error(
"ERROR: model does not exist for provider",
"model", modelIDsWithProviderAndAttributes,
"provider", providerID,
"valid", strings.Join(modelIDs, ", "),
)

continue
}

// If a model with attributes is requested, we add the base model plus attributes as new model to our list.
@@ -517,6 +525,12 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
 			evaluationContext.ProviderForModel[m] = p
 			evaluationConfiguration.Models.Selected = append(evaluationConfiguration.Models.Selected, modelIDsWithProviderAndAttributes)
 		}
+
+		if len(unknownModels) > 0 {
+			sort.Strings(unknownModels)
+
+			command.logger.Panicf("ERROR: found unknown providers or models: %s", strings.Join(unknownModels, ", "))
+		}
 	}
 
 	return evaluationContext, evaluationConfiguration, func() {
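
Together, these three hunks change model validation from fail-fast (panic on the first unknown model) to collect-then-fail: every unknown entry is logged, and the run aborts once with the complete sorted list, so a user can correct the whole invocation in one pass. A minimal standalone sketch of the pattern, with illustrative names rather than the repository's types:

    package main

    import (
    	"fmt"
    	"sort"
    	"strings"
    )

    // validateModels reports every unknown model at once instead of aborting
    // on the first miss.
    func validateModels(requested []string, known map[string]bool) error {
    	var unknown []string
    	for _, id := range requested {
    		if !known[id] {
    			unknown = append(unknown, id)
    		}
    	}
    	if len(unknown) > 0 {
    		sort.Strings(unknown)

    		return fmt.Errorf("found unknown providers or models: %s", strings.Join(unknown, ", "))
    	}

    	return nil
    }

    func main() {
    	known := map[string]bool{"provider-a/model-x": true}
    	err := validateModels([]string{"provider-a/model-y", "provider-b/model-z"}, known)
    	fmt.Println(err) // Lists both unknown models, not just the first.
    }
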
@@ -540,17 +554,31 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		command.logger.Panicf("ERROR: empty evaluation configuration")
 	}
 
-	configurationFile, err := os.Create(filepath.Join(evaluationContext.ResultPath, "config.json"))
-	if err != nil {
-		command.logger.Panicf("ERROR: cannot create configuration file: %s", err)
-	}
-	defer func() {
-		if err := configurationFile.Close(); err != nil {
-			command.logger.Panicf("ERROR: %s", err)
-		}
-	}()
-	if err := evaluationConfiguration.Write(configurationFile); err != nil {
-		command.logger.Panicf("ERROR: %s", err)
-	}
+	if command.OnlyValidate {
+		return nil
+	}
+
+	// Initialize logging within result directory.
+	{
+		log := command.logger.With(log.AttributeKeyResultPath, command.ResultPath)
+		command.logger = log
+		evaluationContext.Log = log
+	}
+
+	// Write the final evaluation configuration to the result directory.
+	{
+		configurationFile, err := os.Create(filepath.Join(evaluationContext.ResultPath, "config.json"))
+		if err != nil {
+			command.logger.Panicf("ERROR: cannot create configuration file: %s", err)
+		}
+		defer func() {
+			if err := configurationFile.Close(); err != nil {
+				command.logger.Panicf("ERROR: %s", err)
+			}
+		}()
+		if err := evaluationConfiguration.Write(configurationFile); err != nil {
+			command.logger.Panicf("ERROR: %s", err)
+		}
+	}
 
 	switch command.Runtime {
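
In combination with the `Initialize` changes above, `--only-validate` now exits `Execute` right after the configuration has been checked and before any results directory, result-path logger, or `config.json` is created; the logging and configuration-write steps moved here from `Initialize` so they only run for real evaluations. Assuming the binary and subcommand names follow the package path (not confirmed by this diff), a validation-only run would look like `eval-dev-quality evaluate --only-validate --configuration config.json`.
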