cmd/eval-dev-quality/cmd/evaluate.go (94 changes: 61 additions & 33 deletions)

@@ -61,7 +61,7 @@ type Evaluate struct {
 	// ProviderUrls holds all custom inference endpoint urls for the providers.
 	ProviderUrls map[string]string `long:"urls" description:"Custom OpenAI API compatible inference endpoints (of the form '$provider:$url,...'). Use '$provider=custom-$name' to manually register a custom OpenAI API endpoint provider. Note that the models of a custom OpenAI API endpoint provider must be declared explicitly using the '--model' option. When using the environment variable, separate multiple definitions with ','." env:"PROVIDER_URL" env-delim:","`
 	// APIRequestAttempts holds the number of allowed API requests per LLM query.
-	APIRequestAttempts uint `long:"api-request-attempts" description:"Number of allowed API requests per LLM query." default:"3"`
+	APIRequestAttempts uint `long:"api-request-attempts" description:"Number of allowed API requests per LLM query." default:"10"`
 	// APIRequestTimeout holds the timeout for API requests in seconds.
 	APIRequestTimeout uint `long:"api-request-timeout" description:"Timeout of API requests in seconds. ('0' to disable)" default:"1200"`
 
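
The only change in this first hunk raises the default number of API attempts per LLM query from 3 to 10. For orientation, here is a generic retry loop of the kind such a setting typically drives; this is a sketch assuming a simple linear back-off, not the repository's actual request code:

    package main

    import (
    	"errors"
    	"fmt"
    	"time"
    )

    // queryWithRetries calls query up to "attempts" times and returns the first
    // successful response, or the last error once all attempts are used up.
    func queryWithRetries(attempts uint, query func() (string, error)) (string, error) {
    	var lastErr error
    	for i := uint(0); i < attempts; i++ {
    		response, err := query()
    		if err == nil {
    			return response, nil
    		}
    		lastErr = err
    		// Linear back-off between attempts, kept short for the example.
    		time.Sleep(time.Duration(i+1) * 100 * time.Millisecond)
    	}
    	return "", fmt.Errorf("all %d attempts failed: %w", attempts, lastErr)
    }

    func main() {
    	_, err := queryWithRetries(10, func() (string, error) {
    		return "", errors.New("always failing for demonstration")
    	})
    	fmt.Println(err)
    }
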
@@ -76,6 +76,8 @@ type Evaluate struct {
 	Configuration string `long:"configuration" description:"Configuration file to set up an evaluation run."`
 	// ExecutionTimeout holds the timeout for an execution.
 	ExecutionTimeout uint `long:"execution-timeout" description:"Execution timeout for compilation and tests in minutes." default:"5"`
+	// OnlyValidate indicates that only the configuration is validated and no evaluation is performed.
+	OnlyValidate bool `long:"only-validate" description:"Only validate the configuration and do not perform an evaluation."`
 	// RunIDStartsAt holds the offset increment for the run id used in creating the result folders.
 	RunIDStartsAt uint `long:"run-id-starts-at" description:"Sets the starting index for the run ID." default:"1"`
 	// Runs holds the number of runs to perform.
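
The struct-tag syntax above (`long`, `description`, `default`, `env`, `env-delim`) matches the github.com/jessevdk/go-flags library. Assuming that is indeed the flag parser in use (the import is not visible in this diff), the new option surfaces roughly like this:

    package main

    import (
    	"fmt"
    	"os"

    	flags "github.com/jessevdk/go-flags"
    )

    // options mirrors only the field relevant to this hunk; it is an
    // illustrative stand-in, not the repository's Evaluate type.
    type options struct {
    	OnlyValidate bool `long:"only-validate" description:"Only validate the configuration and do not perform an evaluation."`
    }

    func main() {
    	var opts options
    	// ParseArgs fills opts from the given argument list.
    	if _, err := flags.ParseArgs(&opts, []string{"--only-validate"}); err != nil {
    		fmt.Fprintln(os.Stderr, err)
    		os.Exit(1)
    	}
    	fmt.Println("validate only:", opts.OnlyValidate) // Prints "validate only: true".
    }
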
Expand Down Expand Up @@ -122,6 +124,22 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
evaluationContext = &evaluate.Context{}
evaluationConfiguration = NewEvaluationConfiguration()

// Setup evaluation result directory.
if !command.OnlyValidate {
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", command.timestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
uniqueResultPath, err := util.UniqueDirectory(command.ResultPath)
if err != nil {
command.logger.Panicf("ERROR: %s", err)
}
// Ensure that the directory really exists.
if err := osutil.MkdirAll(uniqueResultPath); err != nil {
command.logger.Panicf("ERROR: %s", err)
}
command.ResultPath = uniqueResultPath
evaluationContext.ResultPath = uniqueResultPath
command.logger.Info("configured results directory", "path", command.ResultPath)
}

// Load the provided configuration file, if any.
if command.Configuration != "" {
if command.Runtime != "local" {
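
`util.UniqueDirectory` itself is not part of this diff. As a rough sketch of the behavior the block above relies on, assuming it simply probes for a free directory name by appending a numeric suffix (the actual helper may differ):

    package main

    import (
    	"fmt"
    	"os"
    )

    // uniqueDirectory is a hypothetical stand-in for util.UniqueDirectory: it
    // returns the path itself if unused, otherwise path-1, path-2, ... until a
    // name is free.
    func uniqueDirectory(path string) (string, error) {
    	candidate := path
    	for i := 1; ; i++ {
    		_, err := os.Stat(candidate)
    		if os.IsNotExist(err) {
    			return candidate, nil // Free name found.
    		} else if err != nil {
    			return "", err // Unexpected filesystem error.
    		}
    		candidate = fmt.Sprintf("%s-%d", path, i) // Name is taken, try the next suffix.
    	}
    }

    func main() {
    	path, err := uniqueDirectory("evaluation-2024-01-02-15:04:05")
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println(path)
    }
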
Expand Down Expand Up @@ -215,29 +233,6 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
evaluationContext.NoDisqualification = command.NoDisqualification
}

// Setup evaluation result directory.
{
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", command.timestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
uniqueResultPath, err := util.UniqueDirectory(command.ResultPath)
if err != nil {
command.logger.Panicf("ERROR: %s", err)
}
// Ensure that the directory really exists.
if err := osutil.MkdirAll(uniqueResultPath); err != nil {
command.logger.Panicf("ERROR: %s", err)
}
command.ResultPath = uniqueResultPath
evaluationContext.ResultPath = uniqueResultPath
command.logger.Info("configured results directory", "path", command.ResultPath)
}

// Initialize logging within result directory.
{
log := command.logger.With(log.AttributeKeyResultPath, command.ResultPath)
command.logger = log
evaluationContext.Log = log
}

// Gather languages.
languagesSelected := map[string]language.Language{}
{
Expand Down Expand Up @@ -343,6 +338,10 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
}
evaluationContext.RepositoryPaths = command.Repositories
evaluationConfiguration.Repositories.Selected = append(evaluationConfiguration.Repositories.Selected, command.Repositories...)

for _, repositoryID := range evaluationConfiguration.Repositories.Selected {
command.logger.Info("selected repository", "repository", repositoryID)
}
}

// Make the resolved selected languages available in the command.
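
The `logger.Info` calls here pass alternating key/value pairs, structured-logging style. The project's logger type is not shown in this diff; the standard library's log/slog is used below purely as a stand-in with the same call shape (repository names are placeholders):

    package main

    import "log/slog"

    func main() {
    	// Placeholder repository identifiers, one log record each.
    	selected := []string{"golang/example", "java/example"}
    	for _, repositoryID := range selected {
    		slog.Info("selected repository", "repository", repositoryID)
    	}
    }
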
Expand Down Expand Up @@ -448,6 +447,7 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
sort.Strings(command.ModelIDsWithProviderAndAttributes)

// Check and initialize models.
var unknownModels []string
evaluationContext.ProviderForModel = map[model.Model]provider.Provider{}
for _, modelIDsWithProviderAndAttributes := range command.ModelIDsWithProviderAndAttributes {
command.logger.Info("selecting model", "model", modelIDsWithProviderAndAttributes)
Expand Down Expand Up @@ -502,7 +502,15 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
var ok bool
m, ok = models[modelIDWithProvider]
if !ok {
command.logger.Panicf("ERROR: model %q does not exist for provider %q. Valid models are: %s", modelIDsWithProviderAndAttributes, providerID, strings.Join(modelIDs, ", "))
unknownModels = append(unknownModels, modelIDsWithProviderAndAttributes)
command.logger.Error(
"ERROR: model does not exist for provider",
"model", modelIDsWithProviderAndAttributes,
"provider", providerID,
"valid", strings.Join(modelIDs, ", "),
)

continue
}

// If a model with attributes is requested, we add the base model plus attributes as new model to our list.
@@ -517,6 +525,12 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate.
 			evaluationContext.ProviderForModel[m] = p
 			evaluationConfiguration.Models.Selected = append(evaluationConfiguration.Models.Selected, modelIDsWithProviderAndAttributes)
 		}
+
+		if len(unknownModels) > 0 {
+			sort.Strings(unknownModels)
+
+			command.logger.Panicf("ERROR: found unknown providers or models: %s", strings.Join(unknownModels, ", "))
+		}
 	}
 
 	return evaluationContext, evaluationConfiguration, func() {
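
Together, these three hunks change model validation from fail-fast (panic on the first unknown model) to collect-then-fail: every unknown entry is logged, and the run aborts once with the complete sorted list, so a user can correct the whole invocation in one pass. A minimal standalone sketch of the pattern, with illustrative names rather than the repository's types:

    package main

    import (
    	"fmt"
    	"sort"
    	"strings"
    )

    // validateModels reports every unknown model at once instead of aborting
    // on the first miss.
    func validateModels(requested []string, known map[string]bool) error {
    	var unknown []string
    	for _, id := range requested {
    		if !known[id] {
    			unknown = append(unknown, id)
    		}
    	}
    	if len(unknown) > 0 {
    		sort.Strings(unknown)

    		return fmt.Errorf("found unknown providers or models: %s", strings.Join(unknown, ", "))
    	}

    	return nil
    }

    func main() {
    	known := map[string]bool{"provider-a/model-x": true}
    	err := validateModels([]string{"provider-a/model-y", "provider-b/model-z"}, known)
    	fmt.Println(err) // Lists both unknown models, not just the first.
    }
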
@@ -540,17 +554,31 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		command.logger.Panicf("ERROR: empty evaluation configuration")
 	}
 
-	configurationFile, err := os.Create(filepath.Join(evaluationContext.ResultPath, "config.json"))
-	if err != nil {
-		command.logger.Panicf("ERROR: cannot create configuration file: %s", err)
-	}
-	defer func() {
-		if err := configurationFile.Close(); err != nil {
-			command.logger.Panicf("ERROR: %s", err)
-		}
-	}()
-	if err := evaluationConfiguration.Write(configurationFile); err != nil {
-		command.logger.Panicf("ERROR: %s", err)
-	}
+	if command.OnlyValidate {
+		return nil
+	}
+
+	// Initialize logging within result directory.
+	{
+		log := command.logger.With(log.AttributeKeyResultPath, command.ResultPath)
+		command.logger = log
+		evaluationContext.Log = log
+	}
+
+	// Write the final evaluation configuration to the result directory.
+	{
+		configurationFile, err := os.Create(filepath.Join(evaluationContext.ResultPath, "config.json"))
+		if err != nil {
+			command.logger.Panicf("ERROR: cannot create configuration file: %s", err)
+		}
+		defer func() {
+			if err := configurationFile.Close(); err != nil {
+				command.logger.Panicf("ERROR: %s", err)
+			}
+		}()
+		if err := evaluationConfiguration.Write(configurationFile); err != nil {
+			command.logger.Panicf("ERROR: %s", err)
+		}
+	}
 
 	switch command.Runtime {
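
In combination with the `Initialize` changes above, `--only-validate` now exits `Execute` right after the configuration has been checked and before any results directory, result-path logger, or `config.json` is created; the logging and configuration-write steps moved here from `Initialize` so they only run for real evaluations. Assuming the binary and subcommand names follow the package path (not confirmed by this diff), a validation-only run would look like `eval-dev-quality evaluate --only-validate --configuration config.json`.
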