Merge and aggregate datasets

Merge and aggregate datasets

Task
Merge and aggregate datasets
You are encouraged to solve this task according to the task description, using any language you may know.


Task

Merge and aggregate two datasets as provided in   .csv   files into a new resulting dataset.

Use the appropriate methods and data structures depending on the programming language.

Use the most common libraries only when built-in functionality is not sufficient.


Note

Either load the data from the   .csv   files or create the required data structures hard-coded.


patients.csv   file contents:

PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz 


visits.csv   file contents:

PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 


Create a resulting dataset in-memory or output it to screen or file, whichever is appropriate for the programming language at hand.

Merge and group per patient id and last name,   get the maximum visit date,   and get the sum and average of the scores per patient to get the resulting dataset.


Note that the visit date is purposefully provided as ISO format,   so that it could also be processed as text and sorted alphabetically to determine the maximum date.

| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | | 


Note

This task is aimed in particular at programming languages that are used in data science and data processing, such as F#, Python, R, SPSS, MATLAB etc.


Related tasks



Translation of: Python: Stdlib csv only
V patients_csv = ‘PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz’ V visits_csv = ‘PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3’ F csv2list(s) [[String]] rows L(row) s.split("\n") rows [+]= row.split(‘,’) R rows V patients = csv2list(patients_csv) V visits = csv2list(visits_csv) V result = copy(patients) result.sort_range(1..) result[0].append(‘LAST_VISIT’) V last = Dict(visits[1..], p_vis -> (p_vis[0], p_vis[1])) L(record) 1 .< result.len result[record].append(last.get(result[record][0], ‘’)) result[0] [+]= [‘SCORE_SUM’, ‘SCORE_AVG’] V n = Dict(patients[1..], p -> (p[0], 0)) V tot = Dict(patients[1..], p -> (p[0], 0.0)) L(record) visits[1..] V p = record[0] V score = record[2] I !score.empty n[p]++ tot[p] += Float(score) L(record) 1 .< result.len V p = result[record][0] I n[p] != 0 result[record] [+]= [‘#3.1’.format(tot[p]), ‘#2.2’.format(tot[p] / n[p])] E result[record] [+]= [‘’, ‘’] L(record) result print(‘| ’record.map(r -> r.center(10)).join(‘ | ’)‘ |’)
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | | 
Merge_and_aggregate(patients, visits){ ID := [], LAST_VISIT := [], SCORE_SUM := [], VISIT := [] for i, line in StrSplit(patients, "`n", "`r"){ if (i=1) continue x := StrSplit(line, ",") ID[x.1] := x.2 } for i, line in StrSplit(visits, "`n", "`r"){ if (i=1) continue x := StrSplit(line, ",") LAST_VISIT[x.1] := x.2 > LAST_VISIT[x.1] ? x.2 : LAST_VISIT[x.1] SCORE_SUM[x.1] := (SCORE_SUM[x.1] ? SCORE_SUM[x.1] : 0) + (x.3 ? x.3 : 0) if x.3 VISIT[x.1] := (VISIT[x.1] ? VISIT[x.1] : 0) + 1 } output := "PATIENT_ID`tLASTNAME`tLAST_VISIT`tSCORE_SUM`tSCORE_AVG`n" for id, name in ID output .= ID "`t" name "`t" LAST_VISIT[id] "`t" SCORE_SUM[id] "`t" SCORE_SUM[id]/VISIT[id] "`n" return output } 

Examples:

patients = ( PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz ) visits = ( PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 ) MsgBox % Merge_and_aggregate(patients, visits) return 
Output:
PATIENT_ID	LASTNAME	LAST_VISIT	SCORE_SUM	SCORE_AVG 1001	Hopper	2020-11-19	17.400000	5.800000 2002	Gosling	2020-10-08	6.800000	6.800000 3003	Kemeny	2020-11-12	0 4004	Wirth	2020-11-05	15.400000	7.700000 5005	Kurtz
# syntax: GAWK -f MERGE_AND_AGGREGATE_DATASETS.AWK RC-PATIENTS.CSV RC-VISITS.CSV # files may appear in any order # # sorting: # PROCINFO["sorted_in"] is used by GAWK # SORTTYPE is used by Thompson Automation's TAWK # { # printf("%s %s\n",FILENAME,$0) # print input split($0,arr,",") if (FNR == 1) { file = (arr[2] == "LASTNAME") ? "patients" : "visits" next } patient_id_arr[key] = key = arr[1] if (file == "patients") { lastname_arr[key] = arr[2] } else if (file == "visits") { if (arr[2] > visit_date_arr[key]) { visit_date_arr[key] = arr[2] } if (arr[3] != "") { score_arr[key] += arr[3] score_count_arr[key]++ } } } END { print("") PROCINFO["sorted_in"] = "@ind_str_asc" ; SORTTYPE = 1 fmt = "%-10s %-10s %-10s %9s %9s %6s\n" printf(fmt,"patient_id","lastname","last_visit","score_sum","score_avg","scores") for (i in patient_id_arr) { avg = (score_count_arr[i] > 0) ? score_arr[i] / score_count_arr[i] : "" printf(fmt,patient_id_arr[i],lastname_arr[i],visit_date_arr[i],score_arr[i],avg,score_count_arr[i]+0) } exit(0) } 
Output:
patient_id lastname last_visit score_sum score_avg scores 1001 Hopper 2020-11-19 17.4 5.8 3 2002 Gosling 2020-10-08 6.8 6.8 1 3003 Kemeny 2020-11-12 0 4004 Wirth 2020-11-05 15.4 7.7 2 5005 Kurtz 0 

Uses C++20

#include <iostream> #include <optional> #include <ranges> #include <string> #include <vector> using namespace std; struct Patient {  string ID;  string LastName; }; struct Visit {  string PatientID;  string Date;  optional<float> Score; }; int main(void)  {  auto patients = vector<Patient> {  {"1001", "Hopper"},  {"4004", "Wirth"},  {"3003", "Kemeny"},  {"2002", "Gosling"},  {"5005", "Kurtz"}};  auto visits = vector<Visit> {   {"2002", "2020-09-10", 6.8},  {"1001", "2020-09-17", 5.5},  {"4004", "2020-09-24", 8.4},  {"2002", "2020-10-08", },  {"1001", "" , 6.6},  {"3003", "2020-11-12", },  {"4004", "2020-11-05", 7.0},  {"1001", "2020-11-19", 5.3}};  // sort the patients by ID  sort(patients.begin(), patients.end(),   [](const auto& a, const auto&b){ return a.ID < b.ID;});   cout << "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |\n";  for(const auto& patient : patients)  {  // loop over all of the patients and determine the fields  string lastVisit;  float sum = 0;  int numScores = 0;    // use C++20 ranges to filter the visits by patients  auto patientFilter = [&patient](const Visit &v){return v.PatientID == patient.ID;};  for(const auto& visit : visits | views::filter( patientFilter ))  {  if(visit.Score)  {  sum += *visit.Score;  numScores++;  }  lastVisit = max(lastVisit, visit.Date);  }    // format the output  cout << "| " << patient.ID << " | ";  cout.width(8); cout << patient.LastName << " | ";  cout.width(10); cout << lastVisit << " | ";  if(numScores > 0)  {  cout.width(9); cout << sum << " | ";  cout.width(9); cout << (sum / float(numScores));  }  else cout << " | ";  cout << " |\n";  } } 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | | 
using System; using System.Collections.Generic; using System.Globalization; using System.Linq; using System.Runtime.Serialization; public static class MergeAndAggregateDatasets {  public static void Main()  {  string patientsCsv = @" PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz";  string visitsCsv = @" PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3";  string format = "yyyy-MM-dd";  var formatProvider = new DateTimeFormat(format).FormatProvider;  var patients = ParseCsv(  patientsCsv.Split(Environment.NewLine, StringSplitOptions.RemoveEmptyEntries),  line => (PatientId: int.Parse(line[0]), LastName: line[1]));  var visits = ParseCsv(  visitsCsv.Split(Environment.NewLine, StringSplitOptions.RemoveEmptyEntries),  line => (  PatientId: int.Parse(line[0]),  VisitDate: DateTime.TryParse(line[1], formatProvider, DateTimeStyles.None, out var date) ? date : default(DateTime?),  Score: double.TryParse(line[2], out double score) ? score : default(double?)  )  );  var results =  patients.GroupJoin(visits,  p => p.PatientId,  v => v.PatientId,  (p, vs) => (  p.PatientId,  p.LastName,  LastVisit: vs.Max(v => v.VisitDate),  ScoreSum: vs.Sum(v => v.Score),  ScoreAvg: vs.Average(v => v.Score)  )  ).OrderBy(r => r.PatientId);  Console.WriteLine("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |");  foreach (var r in results) {  Console.WriteLine($"| {r.PatientId,-10} | {r.LastName,-8} | {r.LastVisit?.ToString(format) ?? "",-10} | {r.ScoreSum,9} | {r.ScoreAvg,9} |");  }  }  private static IEnumerable<T> ParseCsv<T>(string[] contents, Func<string[], T> constructor)  {  for (int i = 1; i < contents.Length; i++) {  var line = contents[i].Split(',');  yield return constructor(line);  }  } } 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | 0 | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | 0 | |

The following implementation is for Souffle. Souffle does not currently include dates and times in the base distribution, but it is straightforward to use C timestamps:

// datetime.cpp #include <ctime> #include <cstdint> extern "C" {  int64_t from date(const char* string) {  struct tm tmInfo = {0};  strptime(string, "%Y-%m-%d", &tmInfo);  return mktime(&tmInfo); // localtime  } } 

Rather than combine the summaries, the date and score summaries have been presented separately.

/* rosetta.dl */ #define NaN to_float("NaN") .functor from_date(date:symbol) : number .decl Patient(id:number, lastname:symbol) .decl Visit(id:number, date:symbol, score:float) .decl SummaryDates(id:number, lastname:symbol, last_date:symbol) .decl SummaryScores(id:number, lastname:symbol, score_sum:float, score_mean:float) .decl MissingDates(x:number) Patient(1001,"Hopper"). Patient(4004,"Wirth"). Patient(3003,"Kemeny"). Patient(2002,"Gosling"). Patient(5005,"Kurtz"). Visit(2002,"2020-09-10",6.8). Visit(1001,"2020-09-17",5.5). Visit(4004,"2020-09-24",8.4). Visit(2002,"2020-10-08",NaN). Visit(1001,"",6.6). Visit(3003,"2020-11-12",NaN). Visit(4004,"2020-11-05",7.0). Visit(1001,"2020-11-19",5.3). MissingDates(@from_date("")) :- true. SummaryDates(id, lastname, last_date) :- Patient(id,lastname), last_timestamp = max ts: {Visit(id, date, _), ts = @from_date(date), !MissingDates(ts)}, Visit(id, last_date, _), last_timestamp = @from_date(last_date). SummaryScores(id, lastname, score_sum, score_mean) :- Patient(id,lastname), score_sum = sum score: {Visit(id, _, score), score != NaN}, score_mean = mean score: {Visit(id, _, score), score != NaN}. .output SummaryDates .output SummaryScores 

Then this is called using:

g++ -shared -fPIC datetime.cpp -o libfunctors.so souffle -D- rosetta.dl 
Output:
--------------- SummaryScores id	lastname	score_sum	score_mean =============== 1001	Hopper	17.399999999999999	5.7999999999999998 2002	Gosling	6.7999999999999998	6.7999999999999998 4004	Wirth	15.4	7.7000000000000002 =============== --------------- SummaryDates id	lastname	last_date =============== 1001	Hopper	2020-11-19 2002	Gosling	2020-10-08 3003	Kemeny	2020-11-12 4004	Wirth	2020-11-05 ===============

In this entry, the SQL for producing the final table is essentially the same as at #SQL on this page, and thus the main point of interest is probably the ease with which the CSV files can be imported.

.print Since the patients.csv file is quite ordinary, it can be imported without fuss. create or replace table patients as  FROM 'patients.csv'; .print Taking the same approach with visits.csv illustrates how DuckDB correctly infers the types, including the date: create or replace table visits as  FROM 'visits.csv'; from visits; .print .print Merge and group per patient id and last name, .print get the maximum visit date, and .print get the sum and average of the scores per patient to get the resulting dataset. SELECT p.PATIENT_ID, p.LASTNAME, MAX(VISIT_DATE) AS LAST_VISIT, SUM(SCORE) AS SCORE_SUM, CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG FROM patients p LEFT JOIN visits v ON v.PATIENT_ID = p.PATIENT_ID GROUP BY p.PATIENT_ID, p.LASTNAME ORDER BY p.PATIENT_ID; 
Output:
Since the patients.csv file is quite ordinary, it can be imported without fuss. Taking the same approach with visits.csv illustrates how DuckDB correctly infers the types, including the date: ┌────────────┬────────────┬────────┐ │ PATIENT_ID │ VISIT_DATE │ SCORE │ │ int64 │ date │ double │ ├────────────┼────────────┼────────┤ │ 2002 │ 2020-09-10 │ 6.8 │ │ 1001 │ 2020-09-17 │ 5.5 │ │ 4004 │ 2020-09-24 │ 8.4 │ │ 2002 │ 2020-10-08 │ │ │ 1001 │ │ 6.6 │ │ 3003 │ 2020-11-12 │ │ │ 4004 │ 2020-11-05 │ 7.0 │ │ 1001 │ 2020-11-19 │ 5.3 │ └────────────┴────────────┴────────┘ Merge and group per patient id and last name, get the maximum visit date, and get the sum and average of the scores per patient to get the resulting dataset. ┌────────────┬──────────┬────────────┬───────────┬───────────────┐ │ PATIENT_ID │ LASTNAME │ LAST_VISIT │ SCORE_SUM │ SCORE_AVG │ │ int64 │ varchar │ date │ double │ decimal(10,2) │ ├────────────┼──────────┼────────────┼───────────┼───────────────┤ │ 1001 │ Hopper │ 2020-11-19 │ 17.4 │ 5.80 │ │ 2002 │ Gosling │ 2020-10-08 │ 6.8 │ 6.80 │ │ 3003 │ Kemeny │ 2020-11-12 │ │ │ │ 4004 │ Wirth │ 2020-11-05 │ 15.4 │ 7.70 │ │ 5005 │ Kurtz │ │ │ │ └────────────┴──────────┴────────────┴───────────┴───────────────┘ 
s$ = input repeat s$ = input until s$ = "" pat$[][] &= strsplit s$ "," pat$[$][] &= "" . proc sort &d$[][] . for i = len d$[][] - 1 downto 1 : for j = 1 to i if strcmp d$[j][1] d$[j + 1][1] > 0 swap d$[j][] d$[j + 1][] . . . sort pat$[][] n = len pat$[][] len sum[] n len cnt[] n # s$ = input repeat s$ = input until s$ = "" vis$[] = strsplit s$ "," for i to n if pat$[i][1] = vis$[1] if strcmp vis$[2] pat$[i][3] > 0 pat$[i][3] = vis$[2] . sum[i] += number vis$[3] cnt[i] += 1 . . . func$ f s$ n . s$ = " " & s$ while len s$ < n : s$ &= " " return s$ . print "PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG" for i to n if sum[i] > 0 sum$ = f sum[i] 11 cnt$ = f sum[i] / cnt[i] 11 else sum$ = f "" 11 cnt$ = f "" 11 . print f pat$[i][1] 11 & "|" & f pat$[i][2] 10 & "|" & f pat$[i][3] 12 & "|" & sum$ & "|" & cnt$ . # input_data PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3
Output:
PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG 1001 | Hopper | 2020-11-19 | 17.40 | 5.80 2002 | Gosling | 2020-10-08 | 6.80 | 3.40 3003 | Kemeny | 2020-11-12 | | 4004 | Wirth | 2020-11-05 | 15.40 | 7.70 5005 | Kurtz | | | 

Stdlib csv only

Using only standard libraries and input from csv files.

Translation of: C++
Type Patient  ID As String  LastName As String End Type Type Visit  PatientID As String  Fecha As String  Score As Single  HasScore As Integer End Type Dim As Patient patients(5) Dim As Visit visits(8) patients(1).ID = "1001": patients(1).LastName = "Hopper" patients(2).ID = "4004": patients(2).LastName = "Wirth" patients(3).ID = "3003": patients(3).LastName = "Kemeny" patients(4).ID = "2002": patients(4).LastName = "Gosling" patients(5).ID = "5005": patients(5).LastName = "Kurtz" visits(1).PatientID = "2002": visits(1).Fecha = "2020-09-10": visits(1).Score = 6.8: visits(1).HasScore = -1 visits(2).PatientID = "1001": visits(2).Fecha = "2020-09-17": visits(2).Score = 5.5: visits(2).HasScore = -1 visits(3).PatientID = "4004": visits(3).Fecha = "2020-09-24": visits(3).Score = 8.4: visits(3).HasScore = -1 visits(4).PatientID = "2002": visits(4).Fecha = "2020-10-08": visits(4).HasScore = 0 visits(5).PatientID = "1001": visits(5).Fecha = "" : visits(5).Score = 6.6: visits(5).HasScore = -1 visits(6).PatientID = "3003": visits(6).Fecha = "2020-11-12": visits(6).HasScore = 0 visits(7).PatientID = "4004": visits(7).Fecha = "2020-11-05": visits(7).Score = 7.0: visits(7).HasScore = -1 visits(8).PatientID = "1001": visits(8).Fecha = "2020-11-19": visits(8).Score = 5.3: visits(8).HasScore = -1 Print "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |" For i As Integer = 1 To 5  Dim As String lastVisit = ""  Dim As Single sum = 0  Dim As Integer numScores = 0  For j As Integer = 1 To 8  If patients(i).ID = visits(j).PatientID Then  If visits(j).HasScore Then  sum += visits(j).Score  numScores += 1  End If  If visits(j).Fecha > lastVisit Then  lastVisit = visits(j).Fecha  End If  End If  Next j  Print "| "; patients(i).ID; " | ";  Print Using "\ \ | \ \ | "; patients(i).LastName; lastVisit;  If numScores > 0 Then  Print Using "#######.# | #######.#"; sum; (sum / Csng(numScores));  Else  Print " | ";  End If  Print " |" Next i Sleep 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 3003 | Kemeny | 2020-11-12 | | | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 5005 | Kurtz | | | |

Stdlib sqlite3 and csv only

Library: SQLite

Using the csv module only to parse the input; and the sqlite3 module (a standard library that comes with the base FreeBASIC installation) to calculate the output.

Translation of: Python
#include once "sqlite3.bi" #include once "string.bi" #include once "crt.bi" ' Callback para SQLite Function callback Cdecl (Byval NotUsed As Any Ptr, Byval argc As Integer, _  Byval argv As ZString Ptr Ptr, Byval colName As ZString Ptr Ptr) As Integer    For i As Integer = 0 To argc - 1  If argv[i] <> NULL Then  Print *colName[i] & ": " & *argv[i] & " ";  Else  Print *colName[i] & ": NULL ";  End If  Next  Print    Return 0 End Function Sub createCSVfiles()  Dim As Integer ff    If Dir("patients.csv") = "" Then  ff = Freefile  Open "patients.csv" For Output As #ff  Print #ff, "PATIENT_ID,LASTNAME"  Print #ff, "1001,Hopper"  Print #ff, "4004,Wirth"  Print #ff, "3003,Kemeny"  Print #ff, "2002,Gosling"  Print #ff, "5005,Kurtz"  Close #ff  End If    If Dir("visits.csv") = "" Then  ff = Freefile  Open "visits.csv" For Output As #ff  Print #ff, "PATIENT_ID,VISIT_DATE,SCORE"  Print #ff, "2002,2020-09-10,6.8"  Print #ff, "1001,2020-09-17,5.5"  Print #ff, "4004,2020-09-24,8.4"  Print #ff, "2002,2020-10-08,"  Print #ff, "1001,,6.6"  Print #ff, "3003,2020-11-12,"  Print #ff, "4004,2020-11-05,7.0"  Print #ff, "1001,2020-11-19,5.3"  Close #ff  End If End Sub Sub createTableHeaders(db As sqlite3 Ptr)  Dim As ZString Ptr errMsg = NULL    ' Create the patient and visit table  sqlite3_exec(db, "CREATE TABLE IF NOT EXISTS patients (PATIENT_ID INT, LASTNAME TEXT);", Cast(Any Ptr, @callback), NULL, @errMsg)  If errMsg <> NULL Then  Print "Error creating patients table: " & *errMsg  sqlite3_free(errMsg)  End If    sqlite3_exec(db, "CREATE TABLE IF NOT EXISTS visits (PATIENT_ID INT, VISIT_DATE TEXT, SCORE NUMERIC(4,1));", Cast(Any Ptr, @callback), NULL, @errMsg)  If errMsg <> NULL Then  Print "Error creating visits table: " & *errMsg  sqlite3_free(errMsg)  End If End Sub Sub fillTablesFromCSV(db As sqlite3 Ptr)  Dim As Integer ff, i  Dim As String lineaCSV, query  Dim As ZString Ptr errMsg = NULL    ff = Freefile  Open "patients.csv" For Input As #ff  Line Input #ff, lineaCSV ' Saltar la cabecera    While Not Eof(ff)  Line Input #ff, lineaCSV  Dim As String id = "", lastname = ""    i = 1  While i <= Len(lineaCSV) And Mid(lineaCSV, i, 1) <> ","  id += Mid(lineaCSV, i, 1)  i += 1  Wend  i += 1 ' Skip the comma    While i <= Len(lineaCSV)  lastname += Mid(lineaCSV, i, 1)  i += 1  Wend    query = "INSERT INTO patients VALUES (" & id & ", '" & lastname & "');"    sqlite3_exec(db, query, NULL, NULL, @errMsg)  If errMsg <> NULL Then  Print "Error inserting patient: " & *errMsg  sqlite3_free(errMsg)  End If  Wend  Close #ff    ff = Freefile  Open "visits.csv" For Input As #ff  Line Input #ff, lineaCSV ' Skip header    While Not Eof(ff)  Line Input #ff, lineaCSV  Dim As String id = "", dateStr = "", score = ""  Dim As Integer fieldNum = 0    For i = 1 To Len(lineaCSV)  If Mid(lineaCSV, i, 1) = "," Then  fieldNum += 1  Else  Select Case fieldNum  Case 0: id += Mid(lineaCSV, i, 1)  Case 1: dateStr += Mid(lineaCSV, i, 1)  Case 2: score += Mid(lineaCSV, i, 1)  End Select  End If  Next i    query = "INSERT INTO visits VALUES (" & id & ", "    If dateStr = "" Then  query &= "NULL, "  Else  query &= "'" & dateStr & "', "  End If    If score = "" Then  query &= "NULL);"  Else  query &= score & ");"  End If    sqlite3_exec(db, query, NULL, NULL, @errMsg)  If errMsg <> NULL Then  Print "Error inserting visit: " & *errMsg  sqlite3_free(errMsg)  End If  Wend  Close #ff End Sub Function joinTablesAndGroup(db As sqlite3 Ptr) As String  Dim As sqlite3_stmt Ptr stmt  Dim As String result = ""  Dim As ZString Ptr errMsg = NULL    Dim As String query = _  "SELECT " & _  " p.PATIENT_ID, " & _  " p.LASTNAME, " & _  " MAX(v.VISIT_DATE) AS LAST_VISIT, " & _  " SUM(v.SCORE) AS SCORE_SUM, " & _  " ROUND(AVG(v.SCORE), 1) AS SCORE_AVG " & _  "FROM " & _  " patients p " & _  "LEFT JOIN " & _  " visits v ON p.PATIENT_ID = v.PATIENT_ID " & _  "GROUP BY " & _  " p.PATIENT_ID, p.LASTNAME " & _  "ORDER BY " & _  " p.PATIENT_ID;"    If sqlite3_prepare_v2(db, query, -1, @stmt, NULL) <> SQLITE_OK Then  Print "Error preparing statement: " & *sqlite3_errmsg(db)  Return "Error"  End If    result = "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |" & Chr(10)  result &= "|------------|----------|------------|-----------|-----------|" & Chr(10)    While sqlite3_step(stmt) = SQLITE_ROW  Dim As String row = "| "    ' PATIENT_ID  If sqlite3_column_type(stmt, 0) <> SQLITE_NULL Then  row &= " " & Trim(Str(sqlite3_column_int(stmt, 0))) & " | "  Else  row &= Space(10) & "| "  End If    ' LASTNAME  If sqlite3_column_type(stmt, 1) <> SQLITE_NULL Then  Dim As String lastname = *Cast(ZString Ptr, sqlite3_column_text(stmt, 1))  row &= lastname & Space(9 - Len(lastname)) & "| "  Else  row &= " | "  End If    ' LAST_VISIT  If sqlite3_column_type(stmt, 2) <> SQLITE_NULL Then  Dim As String visit_date = *Cast(ZString Ptr, sqlite3_column_text(stmt, 2))  row &= visit_date & Space(11 - Len(visit_date)) & "| "  Else  row &= " None | "  End If    ' SCORE_SUM  If sqlite3_column_type(stmt, 3) <> SQLITE_NULL Then  Dim As Double score_sum = sqlite3_column_double(stmt, 3)  row &= " " & Format(score_sum, "#00.#") & " | "  Else  row &= " None | "  End If    ' SCORE_AVG  If sqlite3_column_type(stmt, 4) <> SQLITE_NULL Then  Dim As Double score_avg = sqlite3_column_double(stmt, 4)  row &= " " & Format(score_avg, "##.#") & " |"  Else  row &= " None |"  End If    result &= row & Chr(10)  Wend    sqlite3_finalize(stmt)  Return result End Function ' Main program Dim As sqlite3 Ptr db Dim As Integer rc createCSVfiles() rc = sqlite3_open(":memory:", @db) If rc <> SQLITE_OK Then  Print "Error opening database: " & *sqlite3_errmsg(db)  sqlite3_close(db)  Sleep: End 1 End If createTableHeaders(db) fillTablesFromCSV(db) Dim As String result = joinTablesAndGroup(db) Print result sqlite3_close(db) Sleep 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | |------------|----------|------------|-----------|-----------| | 1001 | Hopper | 2020-11-19 | 17,4 | 5,8 | | 2002 | Gosling | 2020-10-08 | 06,8 | 6,8 | | 3003 | Kemeny | 2020-11-12 | None | None | | 4004 | Wirth | 2020-11-05 | 15,4 | 7,7 | | 5005 | Kurtz | None | None | None |


Note that the scores are right justified to copy the task description. It would be more natural to leave them right justified.

// Merge and aggregate datasets. Nigel Galloway: January 6th., 2021 let rFile(fName)=seq{use n=System.IO.File.OpenText(fName)  n.ReadLine() |> ignore  while not n.EndOfStream do yield n.ReadLine().Split [|','|]} let N=rFile("file1.txt") |> Seq.sort let G=rFile("file2.txt") |> Seq.groupBy(fun n->n.[0]) |> Map.ofSeq let fN n i g e l=printfn "| %-10s | %-8s | %10s |  %-9s | %-9s |" n i g e l  let fG n g=let z=G.[n]|>Seq.sumBy(fun n->try float n.[2] with :? System.FormatException->0.0)  fN n g (G.[n]|>Seq.sort|>Seq.last).[1] (if z=0.0 then "" else string z) (if z=0.0 then "" else string(z/(float(Seq.length G.[n])))) 
Output:
printfn "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |" N|>Seq.iter(fun n->match G.ContainsKey n.[0] with true->fG n.[0] n.[1] |_->fN n.[0] n.[1] "" "" "") | PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 3.4 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | | 
Translation of: Wren
package main import (  "fmt"  "math"  "sort" ) type Patient struct {  id int  lastName string } // maps an id to a lastname var patientDir = make(map[int]string) // maintains a sorted list of ids var patientIds []int func patientNew(id int, lastName string) Patient {  patientDir[id] = lastName  patientIds = append(patientIds, id)  sort.Ints(patientIds)  return Patient{id, lastName} } type DS struct {  dates []string  scores []float64 } type Visit struct {  id int  date string  score float64 } // maps an id to lists of dates and scores var visitDir = make(map[int]DS) func visitNew(id int, date string, score float64) Visit {  if date == "" {  date = "0000-00-00"  }  v, ok := visitDir[id]  if ok {  v.dates = append(v.dates, date)  v.scores = append(v.scores, score)  visitDir[id] = DS{v.dates, v.scores}  } else {  visitDir[id] = DS{[]string{date}, []float64{score}}  }  return Visit{id, date, score} } type Merge struct{ id int } func (m Merge) lastName() string { return patientDir[m.id] } func (m Merge) dates() []string { return visitDir[m.id].dates } func (m Merge) scores() []float64 { return visitDir[m.id].scores } func (m Merge) lastVisit() string {  dates := m.dates()  dates2 := make([]string, len(dates))  copy(dates2, dates)  sort.Strings(dates2)  return dates2[len(dates2)-1] } func (m Merge) scoreSum() float64 {  sum := 0.0  for _, score := range m.scores() {  if score != -1 {  sum += score  }  }  return sum } func (m Merge) scoreAvg() float64 {  count := 0  for _, score := range m.scores() {  if score != -1 {  count++  }  }  return m.scoreSum() / float64(count) } func mergePrint(merges []Merge) {  fmt.Println("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |")  f := "| %d | %-7s | %s | %4s | %4s |\n"  for _, m := range merges {  _, ok := visitDir[m.id]  if ok {  lv := m.lastVisit()  if lv == "0000-00-00" {  lv = " "  }  scoreSum := m.scoreSum()  ss := fmt.Sprintf("%4.1f", scoreSum)  if scoreSum == 0 {  ss = " "  }  scoreAvg := m.scoreAvg()  sa := " "  if !math.IsNaN(scoreAvg) {  sa = fmt.Sprintf("%4.2f", scoreAvg)  }  fmt.Printf(f, m.id, m.lastName(), lv, ss, sa)  } else {  fmt.Printf(f, m.id, m.lastName(), " ", " ", " ")  }  } } func main() {  patientNew(1001, "Hopper")  patientNew(4004, "Wirth")  patientNew(3003, "Kemeny")  patientNew(2002, "Gosling")  patientNew(5005, "Kurtz")  visitNew(2002, "2020-09-10", 6.8)  visitNew(1001, "2020-09-17", 5.5)  visitNew(4004, "2020-09-24", 8.4)  visitNew(2002, "2020-10-08", -1) // -1 signifies no score  visitNew(1001, "", 6.6) // "" signifies no date  visitNew(3003, "2020-11-12", -1)  visitNew(4004, "2020-11-05", 7.0)  visitNew(1001, "2020-11-19", 5.3)  merges := make([]Merge, len(patientIds))  for i, id := range patientIds {  merges[i] = Merge{id}  }  mergePrint(merges) } 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | | 

Reading and merging

Merging of fields and databases is defined as a monoid operation for corresponding types.

import Data.List import Data.Maybe import System.IO (readFile) import Text.Read (readMaybe) import Control.Applicative ((<|>)) ------------------------------------------------------------ newtype DB = DB { entries :: [Patient] }  deriving Show instance Semigroup DB where  DB a <> DB b = normalize $ a <> b instance Monoid DB where  mempty = DB [] normalize :: [Patient] -> DB normalize = DB  . map mconcat   . groupBy (\x y -> pid x == pid y)  . sortOn pid   ------------------------------------------------------------ data Patient = Patient { pid :: String  , name :: Maybe String  , visits :: [String]  , scores :: [Float] }  deriving Show instance Semigroup Patient where  Patient p1 n1 v1 s1 <> Patient p2 n2 v2 s2 =  Patient (fromJust $ Just p1 <|> Just p2)  (n1 <|> n2)  (v1 <|> v2)  (s1 <|> s2) instance Monoid Patient where  mempty = Patient mempty mempty mempty mempty   ------------------------------------------------------------ readDB :: String -> DB readDB = normalize  . mapMaybe readPatient  . readCSV readPatient r = do  i <- lookup "PATIENT_ID" r  let n = lookup "LASTNAME" r  let d = lookup "VISIT_DATE" r >>= readDate  let s = lookup "SCORE" r >>= readMaybe  return $ Patient i n (maybeToList d) (maybeToList s)  where  readDate [] = Nothing  readDate d = Just d readCSV :: String -> [(String, String)] readCSV txt = zip header <$> body  where  header:body = splitBy ',' <$> lines txt  splitBy ch = unfoldr go  where  go [] = Nothing  go s = Just $ drop 1 <$> span (/= ch) s 
let patients = readDB <$> readFile "patients.csv" *Main> let visits = readDB <$> readFile "visits.csv" *Main> mapM_ print . entries =<< patients Patient {pid = "1001", name = Just "Hopper", visits = [], scores = []} Patient {pid = "2002", name = Just "Gosling", visits = [], scores = []} Patient {pid = "3003", name = Just "Kemeny", visits = [], scores = []} Patient {pid = "4004", name = Just "Wirth", visits = [], scores = []} Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []} *Main> mapM_ print . entries =<< visits Patient {pid = "1001", name = Nothing, visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]} Patient {pid = "2002", name = Nothing, visits = ["2020-09-10","2020-10-08"], scores = [6.8]} Patient {pid = "3003", name = Nothing, visits = ["2020-11-12"], scores = []} Patient {pid = "4004", name = Nothing, visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]} *Main> mapM_ print . entries =<< patients <> visits Patient {pid = "1001", name = Just "Hopper", visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]} Patient {pid = "2002", name = Just "Gosling", visits = ["2020-09-10","2020-10-08"], scores = [6.8]} Patient {pid = "3003", name = Just "Kemeny", visits = ["2020-11-12"], scores = []} Patient {pid = "4004", name = Just "Wirth", visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]} Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []}

Pretty tabulation

tabulateDB (DB ps) header cols = intercalate "|" <$> body  where  body = transpose $ zipWith pad width table  table = transpose $ header : map showPatient ps  showPatient p = sequence cols p  width = maximum . map length <$> table  pad n col = (' ' :) . take (n+1) . (++ repeat ' ') <$> col main = do  a <- readDB <$> readFile "patients.csv"  b <- readDB <$> readFile "visits.csv"  mapM_ putStrLn $ tabulateDB (a <> b) header fields  where  header = [ "PATIENT_ID", "LASTNAME", "VISIT_DATE"  , "SCORES SUM","SCORES AVG"]  fields = [ pid  , fromMaybe [] . name  , \p -> case visits p of {[] -> []; l -> last l}  , \p -> case scores p of {[] -> []; s -> show (sum s)}  , \p -> case scores p of {[] -> []; s -> show (mean s)} ]  mean lst = sum lst / genericLength lst 
*Main> main PATIENT_ID | LASTNAME | VISIT_DATE | SCORES SUM | SCORES AVG 1001 | Hopper | 2020-11-19 | 17.4 | 5.7999997 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 3003 | Kemeny | 2020-11-12 | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 5005 | Kurtz | | | 

Harbour does not have special values for NA or NaN, and missing numerical values are represented as zeros. In the following, we have used -999 for missing scores.

#xcommand INSERT INTO <table> ( <uField1>[, <uFieldN> ] ) VALUE ( <uVal1>[, <uValN> ] ) => ; <table>->(dbAppend()); <table>-><uField1> := <uVal1> [; <table>-><uFieldN> := <uValN>] #xcommand INSERT INTO <table> ( <uFieldList,...> ) VALUES ( <uValList1,...> ) [, ( <uValListN,...> )] => ; INSERT INTO <table> ( <uFieldList> ) VALUE ( <uValList1> ) ; [; INSERT INTO <table> ( <uFieldList> ) VALUE ( <uValListN> )] && Singular cases (so we can use VALUES for all instances) #xcommand INSERT INTO <table> (<uField>) VALUE (<uVal>) => ; <table>->(dbAppend()); <table>-><uField> := <uVal> #xcommand INSERT INTO <table> ( <uFieldList,...> ) VALUES ( <uValList,...> ) => ; INSERT INTO <table> ( <uFieldList> ) VALUE ( <uValList> ) PROCEDURE Main() LOCAL pStruct, vStruct, rStruct, xCurId, aAgg SET DATE FORMAT "yyyy-mm-dd" && create and populate the patient table pStruct := {{"patient_id", "n", 8, 0}, {"lastname", "c", 10, 0 }} dbCreate( "patient", pStruct,, .T., "patient" ) INSERT INTO patient (patient_id, lastname) VALUES (1001, "Hopper"), (4004, "Wirth"), ; (3003, "Kemeny"), (2002, "Gosling"), (5005, "Kurtz") INDEX ON patient_id TO pat_id && create and populate the visit table vStruct := {{"patient_id", "n", 8, 0}, {"visit_date", "d", 10, 0}, {"score", "n", 8, 1}} dbCreate( "visit", vStruct,, .T., "visit" ) INSERT INTO visit (patient_id, visit_date, score) VALUES (2002, ctod("2020-09-10"), 6.8), ; (1001, ctod("2020-09-17"), 5.5), (4004, ctod("2020-09-24"), 8.4), ; (2002, ctod("2020-10-08"), -999), (1001, ctod("1900-01-01"), 6.6), ; (3003, ctod("2020-11-12"), -999), (4004, ctod("2020-11-05"), 7.0), ; (1001, ctod("2020-11-19"), 5.3) INDEX ON patient_id TO visit_id && create the result table rStruct := { {"patient_id", "n", 8, 0}, ; {"n", "i", 8, 0}, {"sum_score", "n", 8, 1}, ; {"avg_score", "n", 8, 1}, {"max_date", "d", 10, 0}} dbCreate("report", rStruct,, .T., "report") SELECT visit DO WHILE ! Eof() xCurId := patient_id && grouping variable aAgg := {0, 0, 0.0, ctod("1900-01-01")} && initial values DO WHILE ! Eof() .AND. xCurId == patient_id aAgg := {1+aAgg[1], iif(score==-999,aAgg[2],1+aAgg[2]), ; iif(score==-999, aAgg[3], score+aAgg[3]), max(visit_date, aAgg[4])} && update SKIP ENDDO INSERT INTO report (patient_id, n, sum_score, avg_score, max_date) ; VALUES (xCurId, aAgg[1], aAgg[3], aAgg[3]/aAgg[2], aAgg[4]) ENDDO SELECT report INDEX ON patient_id TO report_id SELECT patient SET RELATION TO patient_id INTO report  ? "NUM", "PATIENT_ID", "LASTNAME", "N", "SUM_SCORE", "AVG_SCORE", "MAX_DATE" LIST patient_id, lastname, report->n, report->sum_score, report->avg_score, report->max_date RETURN 

With output:

NUM PATIENT_ID LASTNAME N SUM_SCORE AVG_SCORE MAX_DATE 1 1001 Hopper 3 17.4 5.8 2020-11-19 4 2002 Gosling 2 6.8 6.8 2020-10-08 3 3003 Kemeny 1 0.0 0.0 2020-11-12 2 4004 Wirth 2 15.4 7.7 2020-11-05 5 5005 Kurtz 0 0.0 0.0 - - 

J

One approach here would be to use Jd

In other words, we can set things up like this:

NB. setup: require'jd pacman' load JDP,'tools/csv_load.ijs' F=: jpath '~temp/rosettacode/example/CSV' jdcreatefolder_jd_ CSVFOLDER=: F assert 0<{{)n PATIENTID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz }} fwrite F,'patients.csv' assert 0<{{)n PATIENTID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 }} fwrite F,'visits.csv' csvprepare 'patients';F,'patients.csv' csvprepare 'visits';F,'visits.csv' csvload 'patients';1 csvload 'visits';1 jd'ref patients PATIENTID visits PATIENTID' 

And, then we can run our query:

require'jd' echo jd {{)n  reads  PATIENT_ID: first p.PATIENTID,  LASTNAME: first p.LASTNAME,  LAST_VISIT: max v.VISIT_DATE,  SCORE_SUM: sum v.SCORE,  SCORE_AVG: avg v.SCORE  by  p.PATIENTID  from  p:patients,  v:p.visits }} -.LF 

Which displays this result:

┌───────────┬──────────┬────────┬──────────┬─────────┬─────────┐ │p.PATIENTID│PATIENT_ID│LASTNAME│LAST_VISIT│SCORE_SUM│SCORE_AVG│ ├───────────┼──────────┼────────┼──────────┼─────────┼─────────┤ │1001 │1001 │Hopper │2020-09-17│5.5 │5.5 │ │4004 │4004 │Wirth │2020-09-24│8.4 │8.4 │ │3003 │3003 │Kemeny │2020-11-12│ __ │ __ │ │2002 │2002 │Gosling │2020-09-10│6.8 │6.8 │ │5005 │5005 │Kurtz │? │ 0 │ 0 │ └───────────┴──────────┴────────┴──────────┴─────────┴─────────┘

Another approach would be to use J's csv library:

require'csv' patients=: fixcsv {{)n PATIENTID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz }} visits=: fixcsv {{)n PATIENTID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 }} task=: {{  P=. <@:>"1|:/:~}.patients  V=. <@:>"1|:/:~}.visits  id=. 0 {:: P  nm=. 1 {:: P  sel1=. (0 {:: P) e. 0 {:: V  sel2=. (~.0 {:: V) e. 0 {:: P NB. unnecessary for this example  exp=. sel1 #inv sel2 # ]  agg=. /.(&.:".)  vdt=. exp (0 {:: V) {:/. 1 {:: V  sum=. exp ":,.(0 {:: V) +//. 0". 2 {:: V  avg=. exp ":,.(0 {:: V) (+/%#)/. 0". 2 {:: V  labels=. ;:'PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG'  labels,:id;nm;vdt;sum;avg }} 

Here:

 task'' ┌──────────┬────────┬──────────┬─────────┬─────────┐ PATIENT_IDLASTNAMELAST_VISITSCORE_SUMSCORE_AVG ├──────────┼────────┼──────────┼─────────┼─────────┤ 1001 Hopper 2020-11-1917.4 5.8  2002 Gosling 2020-10-08 6.8 3.4  3003 Kemeny 2020-11-12 0  0  4004 Wirth 2020-11-0515.4 7.7  5005 Kurtz     └──────────┴────────┴──────────┴─────────┴─────────┘ 

If the empty score in visits was a display concern, we might instead do it this way:

task=: {{  P=. <@:>"1|:/:~}.patients  V=. <@:>"1|:/:~}.visits  id=. 0 {:: P  nm=. 1 {:: P  sel1=. (0 {:: P) e. 0 {:: V  sel2=. (~.0 {:: V) e. 0 {:: P NB. unnecessary for this example  exp=. sel1 #inv sel2 # ]  agg=. /.(&.:".)  vdt=. exp (0 {:: V) {:/. 1 {:: V  sel3=. (0 {:: V) +.//. 2 *@#@{::"1 }.visits  exp2=: [:exp sel3 #inv sel3 #]  sum=. exp2 ":,.(0 {:: V) +//. 0". 2 {:: V  avg=. exp2 ":,.(0 {:: V) (+/%#)/. 0". 2 {:: V  labels=. ;:'PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG'  labels,:id;nm;vdt;sum;avg }} 

Which gives us:

 task'' ┌──────────┬────────┬──────────┬─────────┬─────────┐ PATIENT_IDLASTNAMELAST_VISITSCORE_SUMSCORE_AVG ├──────────┼────────┼──────────┼─────────┼─────────┤ 1001 Hopper 2020-11-1917.4 5.8  2002 Gosling 2020-10-08 6.8 3.4  3003 Kemeny 2020-11-12   4004 Wirth 2020-11-0515.4 7.7  5005 Kurtz     └──────────┴────────┴──────────┴─────────┴─────────┘ 
import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.DoubleSummaryStatistics; import java.util.List; public final class MergeAndAggregateDatasets { public static void main(String[] args) { List<Patient> patients = Arrays.asList( new Patient("1001", "Hopper"), new Patient("4004", "Wirth"), new Patient("3003", "Kemeny"), new Patient("2002", "Gosling"), new Patient("5005", "Kurtz") ); List<Visit> visits = Arrays.asList(  new Visit("2002", "2020-09-10", 6.8),  new Visit("1001", "2020-09-17", 5.5),  new Visit("4004", "2020-09-24", 8.4),  new Visit("2002", "2020-10-08", null),  new Visit("1001", "" , 6.6),  new Visit("3003", "2020-11-12", null),  new Visit("4004", "2020-11-05", 7.0),  new Visit("1001", "2020-11-19", 5.3) );  Collections.sort(patients, Comparator.comparing(Patient::patientID)); System.out.println("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |");  for ( Patient patient : patients ) {   List<Visit> patientVisits = visits.stream().filter( v -> v.visitID == patient.patientID() ).toList();  String lastVisit = patientVisits.stream()  .map( v -> v.visitDate ).max(Comparator.naturalOrder()).orElseGet( () -> " None " );   DoubleSummaryStatistics statistics = patientVisits.stream()  .filter( v -> v.score != null ).mapToDouble(Visit::score).summaryStatistics();   double scoreSum = statistics.getSum();  double scoreAverage = statistics.getAverage();   String patientDetails = String.format("%12s%11s%13s%12.2f%12.2f",   patient.patientID, patient.lastName, lastVisit, scoreSum, scoreAverage);   System.out.println(patientDetails);  }  private static record Patient(String patientID, String lastName) {};  private static record Visit(String visitID, String visitDate, Double score) {}; }  } 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | 1001 Hopper 2020-11-19 17.40 5.80 2002 Gosling 2020-10-08 6.80 6.80 3003 Kemeny 2020-11-12 0.00 0.00 4004 Wirth 2020-11-05 15.40 7.70 5005 Kurtz None 0.00 0.00 
Works with: NodeJS 16.14.2
Translation of: Java
class Patient {  constructor(patientID, lastName) {  this.patientID = patientID;  this.lastName = lastName;  } }  class Visit {  constructor(visitID, visitDate, score) {  this.visitID = visitID;  this.visitDate = visitDate;  this.score = score;  } }  function main() {  const patients = [  new Patient("1001", "Hopper"),  new Patient("4004", "Wirth"),  new Patient("3003", "Kemeny"),  new Patient("2002", "Gosling"),  new Patient("5005", "Kurtz"),  ];   const visits = [  new Visit("2002", "2020-09-10", 6.8),  new Visit("1001", "2020-09-17", 5.5),  new Visit("4004", "2020-09-24", 8.4),  new Visit("2002", "2020-10-08", null),  new Visit("1001", "", 6.6),  new Visit("3003", "2020-11-12", null),  new Visit("4004", "2020-11-05", 7.0),  new Visit("1001", "2020-11-19", 5.3),  ];   // Sort patients by patientID  patients.sort((a, b) => a.patientID.localeCompare(b.patientID));   console.log("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |");   patients.forEach((patient) => {  const patientVisits = visits.filter((v) => v.visitID === patient.patientID);   // Find the last visit date  const validVisitDates = patientVisits  .map((v) => v.visitDate)  .filter((date) => date !== "");  const lastVisit = validVisitDates.length > 0  ? validVisitDates.reduce((latest, date) => date > latest ? date : latest, "")  : " None ";   // Calculate score sum and average  const validScores = patientVisits  .map((v) => v.score)  .filter((score) => score !== null);  const scoreSum = validScores.reduce((sum, score) => sum + score, 0);  const scoreAverage = validScores.length > 0  ? scoreSum / validScores.length  : 0;   // Format the output  const patientDetails = [  patient.patientID.padEnd(12),  patient.lastName.padEnd(11),  lastVisit.padEnd(13),  scoreSum.toFixed(2).padStart(12),  scoreAverage.toFixed(2).padStart(12),  ].join("");   console.log(patientDetails);  }); }  main(); 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | 1001 Hopper 2020-11-19 17.40 5.80 2002 Gosling 2020-10-08 6.80 6.80 3003 Kemeny 2020-11-12 0.00 0.00 4004 Wirth 2020-11-05 15.40 7.70 5005 Kurtz None 0.00 0.00 


Works with: jq

Works with gojq, the Go implementation of jq

In the context of jq, a relational dataset "table" is naturally represented as an array of JSON objects, each representing a row in the table. When displaying such a table, we will only show the constituent rows. Null values will be represented by JSON's `null`.

Ingesting CSV data

# objectify/1 takes an array of atomic values as inputs, and packages # these into an object with keys specified by the "headers" array and # values obtained by trimming string values, replacing empty strings # by null, and converting strings to numbers if possible. def objectify(headers): def tonumberq: tonumber? // .; def trimq: if type == "string" then sub("^ +";"") | sub(" +$";"") else . end; def tonullq: if . == "" then null else . end; . as $in | reduce range(0; headers|length) as $i ({}; .[headers[$i]] = ($in[$i] | trimq | tonumberq | tonullq) ); def csv2jsonHelper: .[0] as $headers | reduce (.[1:][] | select(length > 0) ) as $row ([]; . + [ $row|objectify($headers) ]);

Aggregation functions

# output {LAST_VISIT} def LAST_VISIT($patient_id): {LAST_VISIT: (map(select( .PATIENT_ID == $patient_id).VISIT_DATE) | max)}; # output {SCORE_SUM, SCORE_AVG} def SCORE_SUMMARY($patient_id): map(select( .PATIENT_ID == $patient_id).SCORE) | {SCORE_SUM: add, count: length} | {SCORE_SUM, SCORE_AVG: (if .SCORE_SUM and .count > 0 then .SCORE_SUM/.count else null end)};

The task

# Read the two tables: INDEX($patients | [splits("\n")] | map(split(",")) | csv2jsonHelper[]; .PATIENT_ID) as $patients | ($visits | [splits("\n")] | map(split(",")) | csv2jsonHelper) as $visits # Construct the new table: | $visits | map(.PATIENT_ID as $PATIENT_ID | {$PATIENT_ID} + ($visits | {LASTNAME: $patients[$PATIENT_ID|tostring]} + LAST_VISIT($PATIENT_ID) + SCORE_SUMMARY($PATIENT_ID))) # ... but display it as a sequence of JSON objects | .[]

Invocation

 jq -Rnc --rawfile patients patients.csv --rawfile visits visits.csv -f program.jq 
Output:
{"PATIENT_ID":2002,"LASTNAME":{"PATIENT_ID":2002,"LASTNAME":"Gosling"},"LAST_VISIT":"2020-10-08","SCORE_SUM":6.8,"SCORE_AVG":3.4} {"PATIENT_ID":1001,"LASTNAME":{"PATIENT_ID":1001,"LASTNAME":"Hopper"},"LAST_VISIT":"2020-11-19","SCORE_SUM":17.4,"SCORE_AVG":5.8} {"PATIENT_ID":4004,"LASTNAME":{"PATIENT_ID":4004,"LASTNAME":"Wirth"},"LAST_VISIT":"2020-11-05","SCORE_SUM":15.4,"SCORE_AVG":7.7} {"PATIENT_ID":2002,"LASTNAME":{"PATIENT_ID":2002,"LASTNAME":"Gosling"},"LAST_VISIT":"2020-10-08","SCORE_SUM":6.8,"SCORE_AVG":3.4} {"PATIENT_ID":1001,"LASTNAME":{"PATIENT_ID":1001,"LASTNAME":"Hopper"},"LAST_VISIT":"2020-11-19","SCORE_SUM":17.4,"SCORE_AVG":5.8} {"PATIENT_ID":3003,"LASTNAME":{"PATIENT_ID":3003,"LASTNAME":"Kemeny"},"LAST_VISIT":"2020-11-12","SCORE_SUM":null,"SCORE_AVG":null} {"PATIENT_ID":4004,"LASTNAME":{"PATIENT_ID":4004,"LASTNAME":"Wirth"},"LAST_VISIT":"2020-11-05","SCORE_SUM":15.4,"SCORE_AVG":7.7} {"PATIENT_ID":1001,"LASTNAME":{"PATIENT_ID":1001,"LASTNAME":"Hopper"},"LAST_VISIT":"2020-11-19","SCORE_SUM":17.4,"SCORE_AVG":5.8} 
Translation of: Python
using CSV, DataFrames, Statistics # load data from csv files #df_patients = CSV.read("patients.csv", DataFrame) #df_visits = CSV.read("visits.csv", DataFrame) # create DataFrames from text that is hard coded, so use IOBuffer(String) as input str_patients = IOBuffer("""PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz """) df_patients = CSV.read(str_patients, DataFrame) str_visits = IOBuffer("""PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 """) df_visits = CSV.read(str_visits, DataFrame) # merge on PATIENT_ID, using an :outer join or we lose Kurtz, who has no data, sort by ID df_merge = sort(join(df_patients, df_visits, on="PATIENT_ID", kind=:outer), (:PATIENT_ID,)) fnonmissing(a, f) = isempty(a) ? [] : isempty(skipmissing(a)) ? a[1] : f(skipmissing(a)) # group by patient id / last name and then aggregate to get latest visit and mean score df_result = by(df_merge, [:PATIENT_ID, :LASTNAME]) do df  DataFrame(LATEST_VISIT = fnonmissing(df[:VISIT_DATE], maximum),  SUM_SCORE = fnonmissing(df[:SCORE], sum),  MEAN_SCORE = fnonmissing(df[:SCORE], mean)) end println(df_result) 
Output:
5×5 DataFrame │ Row │ PATIENT_ID │ LASTNAME │ LATEST_VISIT │ SUM_SCORE │ MEAN_SCORE │ │ │ Int64? │ String? │ Dates.Date? │ Float64? │ Float64? │ ├─────┼────────────┼──────────┼──────────────┼───────────┼────────────┤ │ 1 │ 1001 │ Hopper │ 2020-11-19 │ 17.4 │ 5.8 │ │ 2 │ 2002 │ Gosling │ 2020-10-08 │ 6.8 │ 6.8 │ │ 3 │ 3003 │ Kemeny │ 2020-11-12 │ missing │ missing │ │ 4 │ 4004 │ Wirth │ 2020-11-05 │ 15.4 │ 7.7 │ │ 5 │ 5005 │ Kurtz │ missing │ missing │ missing │ 
Works with: UCB Logo version 6.2.5

UCBLogo has no built-in support for generic CSV files.

to read.simple.csv :in local [line list] make "list [] openread :in setread :in while [not eofp] [ make "line subst.comma readword make "line parse map [ifelse equalp ? ", ["\ ] [?]] :line make "list fput :line :list ] close :in setread [] output reverse :list end to subst.comma :string local [out k] make "k 1 make "out " foreach :string [ ifelse equalp ? ", ~ [ifelse equalp :k 1 ~ [make "out word :out ? make "k sum :k 1] ~ [make "out word :out "|missing,| make "k 1]] ~ [make "out word :out ? make "k 1] ] output ifelse equalp last :out ", [word :out "missing] [:out] end to aggregate :patients :visits if emptyp :patients [output []] output fput aggregate.helper first :patients :visits aggregate bf :patients :visits end to aggregate.helper :patient :visits [s 0] [c 0] [m "] ;m for max date. s for sum of scores. c number of scores if emptyp :visits [ output (se :patient ifelse equalp :m " ["missing] [:m] ~ ifelse equalp :s 0 ["missing] [:s] ~ ifelse equalp :s 0 ["missing] [quotient :s :c]) ] output ifelse equalp first :patient first first :visits [ (aggregate.helper :patient bf :visits ~ ifelse equalp last first :visits "missing [:s][sum :s last first :visits] ~ ifelse equalp last first :visits "missing [:c][sum :c 1] ~ ifelse equalp first bf first :visits "missing [:m] ~ [ifelse beforep :m first bf first :visits [first bf first :visits] [:m]])] ~ [(aggregate.helper :patient bf :visits :s :c :m)] end to mergesort :list localmake "half split (count :list) / 2 [] :list if empty? first :half [output :list] output merge mergesort first :half mergesort last :half end to split :size :front :list if :size < 1 [output list :front :list] output split :size-1 (lput first :list :front) (butfirst :list) end to merge :small :large if empty? :small [output :large] ifelse lessequal? first first :small first first :large ~ [output fput first :small merge butfirst :small :large] ~ [output fput first :large merge butfirst :large :small] end to csv.write :list :out openwrite :out setwrite :out csv.write.lines :list close :out setwrite [] end to csv.write.lines :list if emptyp :list [stop] print csv.write.fields first :list csv.write.lines bf :list end to csv.write.fields :line if equalp count :line 1 [output first :line] output (word first :line ", csv.write.fields bf :line) end
make "patients bf read.simple.csv "patients.csv make "visits bf read.simple.csv "visits.csv make "out mergesort aggregate :patients :visits make "out fput [patient_id lastname max_date score_sum score_mean] :out csv.write :out "out.csv
Output:
Output: patient_id lastname max_date score_sum score_mean 1001 Hopper 2020-11-19 17.4 5.8 2002 Gosling 2020-10-08 6.8 6.8 3003 Kemeny 2020-11-12 missing missing 4004 Wirth 2020-11-05 15.4 7.7 5005 Kurtz missing missing missing 


a = ImportString["PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz", "CSV"]; b = ImportString["PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3", "CSV"]; a = <|a[[1, 1]] -> #1, a[[1, 2]] -> #2|> & @@@ Rest[a]; b = <|b[[1, 1]] -> #1, b[[1, 2]] -> If[#2 != "", DateObject[#2], Missing[]], b[[1, 3]] -> If[#3 =!= "", #3, Missing[]]|> & @@@ Rest[b]; j = JoinAcross[a, b, Key["PATIENT_ID"], "Outer"]; gr = GroupBy[j, #["PATIENT_ID"] &]; <|"PATIENT_ID" -> #[[1, "PATIENT_ID"]],   "LASTNAME" -> #[[1, "LASTNAME"]],   "VISIT_DATE" -> If[DeleteMissing[#[[All, "VISIT_DATE"]]] =!= {}, Max@DeleteMissing[#[[All, "VISIT_DATE"]]], Missing[]],   "SCORE_SUM" -> If[DeleteMissing@#[[All, "SCORE"]] =!= {}, Total@DeleteMissing@#[[All, "SCORE"]], Missing[]],   "SCORE_AVG" -> If[DeleteMissing@#[[All, "SCORE"]] =!= {}, Mean@DeleteMissing@#[[All, "SCORE"]], Missing[]]|> & /@   gr // Dataset 
Output:
PATIENT_ID	LASTNAME	VISIT_DATE	SCORE_SUM	SCORE_AVG 2002	2002	Gosling	Thu 8 Oct 2020	6.8	6.8 1001	1001	Hopper	Thu 19 Nov 2020	17.4	5.8 4004	4004	Wirth	Thu 5 Nov 2020	15.4	7.7 3003	3003	Kemeny	Thu 12 Nov 2020	\[LongDash]	\[LongDash] 5005	5005	Kurtz	\[LongDash]	\[LongDash]	\[LongDash]
:- module rosetta. :- interface. :- import_module io. :- pred main(io::di, io::uo) is det. :- implementation. :- import_module list, int, float, string, maybe, solutions. :- pred patient(int::out, string::out) is multi. patient(1001, "Hopper"). patient(4004, "Wirth"). patient(3003, "Kemeny"). patient(2002, "Gosling"). patient(5005, "Kurtz"). :- func nan = float. nan = det_to_float("NaN"). :- type maybe_date ---> date(year::int, month::int, day::int); no. :- pred visit(int::out, maybe_date::out, float::out) is multi. visit(2002, date(2020,09,10), 6.8). visit(1001, date(2020,09,17), 5.5). visit(4004, date(2020,09,24), 8.4). visit(2002, date(2020,10,08), nan). visit(1001, no, 6.6). visit(3003, date(2020,11,12), nan). visit(4004, date(2020,11,05), 7.0). visit(1001, date(2020,11,19), 5.3). %% Utilities :- pred bag_aggr(pred(T)::(pred(out) is nondet), pred(T,U,U)::pred(in,in,out) is det, U::in, U::out) is det. :- pred bag_count(pred(T)::(pred(out) is nondet), int::out) is det. :- pred bag_sum(pred(float)::(pred(out) is nondet), float::out) is det. :- pred bag_avg(pred(float)::(pred(out) is nondet), float::out) is det. :- pred bag_max(pred(T)::(pred(out) is nondet), T::in, T::out) is det. :- pred bag_max_date(pred(maybe_date)::(pred(out) is nondet), maybe_date::out) is det. bag_aggr(Predicate, Aggregator, Initial, Result) :- promise_equivalent_solutions[Result] ( unsorted_aggregate(Predicate, Aggregator, Initial, Result)). bag_count(Predicate, Count) :- bag_aggr(Predicate, (pred(_X::in,Y::in,Z::out) is det :- Z = Y+1), 0, Count). bag_sum(Predicate, Sum) :- bag_aggr(Predicate, (pred(X::in,Y::in,Z::out) is det :- Z = X+Y), 0.0, Sum). bag_avg(Predicate, Avg) :- bag_count(Predicate, N), bag_sum(Predicate, Sum), (if N = 0 then Avg = nan else Avg = Sum/float(N)). bag_max(Predicate, Initial, Max) :- bag_aggr(Predicate, (pred(X::in,Y::in,Z::out) is det :- compare(R,X,Y), (if R = (>) then Z = X else Z = Y)), Initial, Max). bag_max_date(Predicate, MaxDate) :- bag_max(Predicate, date(0,0,0), MaxDate1), (if MaxDate1 = date(0,0,0) then MaxDate = no else MaxDate = MaxDate1). main(!IO) :- print_line("{Id, Lastname, SumScores, AvgScores, MaxDate}:", !IO), aggregate((pred({Id,Lastname,Sum,Avg,MaxDate}::out) is nondet :- patient(Id,Lastname), Scores = (pred(Score::out) is nondet :- visit(Id,_,Score), \+is_nan(Score)), bag_avg(Scores, Avg), bag_sum(Scores, Sum), Dates = (pred(Date::out) is nondet :- visit(Id,Date,_), Date\=no), bag_max_date(Dates, MaxDate)), print_line, !IO). 
Output:
{Id, Lastname, SumScores, AvgScores, MaxDate}: {1001, "Hopper", 17.4, 5.8, date(2020, 11, 19)} {2002, "Gosling", 6.8, 6.8, date(2020, 10, 8)} {3003, "Kemeny", 0.0, nan, date(2020, 11, 12)} {4004, "Wirth", 15.4, 7.7, date(2020, 11, 5)} {5005, "Kurtz", 0.0, nan, no}

CSV files and tables

import algorithm, parsecsv, strformat, strutils, tables const NoValue = -1.0 type  Names = OrderedTable[Positive, string] # Mapping id -> last name.  Visit = tuple[date: string; score: float]  Visits = Table[Positive, seq[Visit]] # Mapping id -> list of visits. proc readNames(path: string): Names =  ## Read the records (id, lastname) from the CSV file and fill a Names table.  var parser: CsvParser  parser.open(path)  parser.readHeaderRow()  while parser.readRow():  let id = parser.row[0].parseInt  let name = parser.row[1]  result[id] = name proc readVisits(path: string): Visits =  ## Read the records (id, date, score) from the CSV file and fill a Visits table.  var parser: CsvParser  parser.open(path)  parser.readHeaderRow()  while parser.readRow():  let id = parser.row[0].parseInt  let date = parser.row[1]  let score = if parser.row[2].len == 0: NoValue else: parser.row[2].parseFloat  result.mgetOrPut(id, @[]).add (date, score) var  names = readNames("patients1.csv")  visits = readVisits("patients2.csv") names.sort(system.cmp) echo "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |" for (id, name) in names.pairs:  let visitList = visits.getOrDefault(id).sorted()  let lastVisit = if visitList.len == 0: "" else: visitList[^1].date  var sum = 0.0  var count = 0  for visit in visitList:  if visit.score != NoValue:  sum += visit.score  inc count  let scoreSum = if count == 0: "" else: &"{sum:>4.1f}"  let scoreAvg = if count == 0: "" else: &"{sum / count.toFloat: >4.2f}"  echo &"| {id:^10} | {name:^10} | {lastVisit:^10} | {scoreSum:>7} | {scoreAvg:>6} |" 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |

CSV files and SQLite3

Translation of: Python

We use the high level standard library "db_sqlite" rather than the low level one "sqlite3".

import parseCsv, db_sqlite, sequtils, strutils const FNames = ["patients1.csv", "patients2.csv"] proc createTableHeaders(conn: DbConn) =  conn.exec(sql"CREATE TABLE names(PATIENT_ID INT, LASTNAME TEXT);")  conn.exec(sql"CREATE TABLE visits(PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1));") proc fillTables(dbc: DbConn) =  for idx, fname in FNames:  dbc.exec(sql"BEGIN")  var parser: CsvParser  parser.open(fname)  parser.readHeaderRow()  while parser.readRow():  if idx == 0: # "names" table.  dbc.exec(sql"INSERT INTO names VALUES (?, ?);", parser.row)  else: # "visits" table  dbc.exec(sql"INSERT INTO visits VALUES (?, ?, ?);", parser.row)  dbc.exec(sql"COMMIT") proc joinTablesAndGroup(dbc: DbConn): seq[Row] =  dbc.exec(sql"""CREATE TABLE answer AS  SELECT  names.PATIENT_ID,  names.LASTNAME,  MAX(VISIT_DATE) AS LAST_VISIT,  SUM(SCORE) AS SCORE_SUM,  CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG  FROM  names  LEFT JOIN visits  ON visits.PATIENT_ID = names.PATIENT_ID  GROUP BY  names.PATIENT_ID,  names.LASTNAME  ORDER BY  names.PATIENT_ID;""")  result = dbc.getAllRows(sql"SELECT * FROM ANSWER") # Build the database and execute the request to get the result. let dbc = open(":memory:", "", "", "") dbc.createTableHeaders() dbc.fillTables() let result = dbc.joinTablesAndGroup() dbc.close() # Print the result. echo "| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |" for row in result:  echo "| " & row.mapit(it.center(10)).join(" | ") & '|' 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 3.4 | | 3003 | Kemeny | 2020-11-12 | 0.0 | 0 | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | |
constant patients_txt = split(""" PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz""",'\n'), visits_txt = split(""" PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3""",'\n') --or, assuming these files contain exactly the same actual raw text, -- and the use of GT_LF_STRIPPED is instead of above split('\n'): --constant patients_txt = get_text("patients.csv",GT_LF_STRIPPED), -- visits_txt = get_text("visits.csv",GT_LF_STRIPPED) function sap(string s) return split(s,',',false)&{"",0,0} end function function svp(string s) return split(s,',',false) end function sequence patient_data = sort(apply(patients_txt[2..$],sap)), visit_data = sort_columns(apply(visits_txt[2..$],svp),{1,-2}) visit_data = append(visit_data,{"","","0"}) -- (add a sentinel) string last_id = "",id,name,dt,scstr,avstr atom score,score_total,average integer visit_count = 0, pdx = 1 for i=1 to length(visit_data) do {id,dt,scstr} = visit_data[i] score = iff(scstr=""?0:scanf(scstr,"%f")[1][1]) if id!=last_id then if visit_count then average = score_total/visit_count patient_data[pdx][4..5] = {score_total,average} end if if i=length(visit_data) then exit end if -- (sentinel) score_total = score visit_count = (score!=0) while id!=patient_data[pdx][1] do pdx += 1 end while patient_data[pdx][3] = dt last_id = id elsif score!=0 then score_total += score visit_count += 1 end if end for printf(1,"| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |\n") for i=1 to length(patient_data) do {id,name,dt,score,average} = patient_data[i] scstr = iff(score=0?"":sprintf("%4.1f",score)) avstr = iff(average=0?"":sprintf("%4.2f",average)) printf(1,"| %-10s |  %-7s | %10s |  %-9s | %-9s |\n", {id,name,dt,scstr,avstr}) end for 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | | 

Weird requirement: "Use the most common libraries only when built-in functionality is not sufficient." Not even a "use strict;" :)

#!/usr/bin/perl my $fmt = '| %-11s' x 5 . "|\n"; printf $fmt, qw( PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG); my ($names, $visits) = do { local $/; split /^\n/m, <DATA> }; my %score; for ( $visits =~ /^\d.*/gm )  {  my ($id, undef, $score) = split /,/;  $score{$id} //= ['', ''];  $score and $score{$id}[0]++, $score{$id}[1] += $score;  } for ( sort $names =~ /^\d.*/gm )  {  my ($id, $name) = split /,/;  printf $fmt, $id, $name, ( sort $visits =~ /^$id,(.*?),/gm, '' )[-1],  $score{$id}[0]  ? ( $score{$id}[1], $score{$id}[1] / $score{$id}[0])  : ('', '');  } __DATA__ PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | | 

Implemented using SWI Prolog:

patient(1001,'Hopper'). patient(4004,'Wirth'). patient(3003,'Kemeny'). patient(2002,'Gosling'). patient(5005,'Kurtz'). visit(2002,'2020-09-10',6.8). visit(1001,'2020-09-17',5.5). visit(4004,'2020-09-24',8.4). visit(2002,'2020-10-08',nan). visit(1001,'',6.6). visit(3003,'2020-11-12',nan). visit(4004,'2020-11-05',7.0). visit(1001,'2020-11-19',5.3). summaryDates(Id, Lastname, LastDate) :- aggregate(max(Ts), Score^Date^(visit(Id, Date, Score), Date \= '', parse_time(Date, iso_8601, Ts)), MaxTs), format_time(atom(LastDate), '%Y-%m-%d', MaxTs), patient(Id,Lastname). summaryScores(Id, Lastname, Sum, Mean) :- aggregate(r(sum(Score),count), Date^(visit(Id, Date, Score), Score \= nan), r(Sum,Count)), patient(Id,Lastname), Mean is Sum/Count. test :- summaryDates(Id, Lastname, LastDate), writeln(summaryDates(Id, Lastname, LastDate)), fail. test :- summaryScores(Id, Lastname, ScoreSum, ScoreMean), writeln(summaryScores(Id, Lastname, ScoreSum, ScoreMean)), fail. 
Output:
summaryDates(1001,Hopper,2020-11-19) summaryDates(2002,Gosling,2020-10-08) summaryDates(3003,Kemeny,2020-11-12) summaryDates(4004,Wirth,2020-11-05) summaryScores(1001,Hopper,17.4,5.8) summaryScores(2002,Gosling,6.8,6.8) summaryScores(4004,Wirth,15.4,7.7) false.

Implemented using XSB Prolog (which allows for user-defined aggregates):

:- import bagMax/2, bagCount/2, bagSum/2, bagReduce/4 from aggregs. :- import julian_date/7, date_string/3 from iso8601. :- import load_csv/2, add_cvt_type_hook/2 from proc_files. ?- add_cvt_type_hook(date,date_converter(_,_)). date_converter(Atom,Date) :- date_string('YYYY-MM-DD',Date,Atom). :- load_csv('visit.csv',visit(integer,date,float)). :- load_csv('patient.csv',patient(integer,atom)). is_nan(Number) :- X is Number, X =\= Number. summaryDates(Id, Lastname, LastDate) :- bagMax(date_number(Id), LastDateNumber), patient(Id,Lastname), julian_date(LastDateNumber, Y, M, D, _, _, _), date_converter(LastDate, date(Y,M,D)). summaryScores(Id, Lastname, Sum, Mean) :- bagSum(scores(Id), Sum), bagCount(scores(Id), Count), Mean is Sum/Count, patient(Id,Lastname). test :- summaryDates(Id,Lastname,LastDate), writeln(summaryDates(Id,Lastname,LastDate)), fail. test :- summaryScores(Id, Lastname, ScoreSum, ScoreMean), writeln(summaryScores(Id, Lastname, ScoreSum, ScoreMean)), fail. /* Put hilog declarations together */ date_number(Id)(Number) :- visit(Id, date(Y,M,D), _), julian_date(Number, Y, M, D, _, _, _). scores(Id)(Score) :- visit(Id, _, Score), \+is_nan(Score). :- hilog maximum. maximum(X,Y,Z) :- X @> Y -> Z=X ; Z=Y. :- hilog sum. sum(X,Y,Z) :- Z is X+Y. :- hilog successor. successor(X,_Y,Z) :- Z is X+1. 
Structure Person  Name$ EndStructure Structure Visits  Datum$  Score$ EndStructure Structure Merge  Patient.Person  List PVisit.Visits() EndStructure NewMap P.Merge() NewList ID$() If ReadFile(1,"./Data/patients.csv")=0 : End 1 : EndIf header=1 While Not Eof(1)  buf1$=ReadString(1)  If header=1 : header=0 : Continue : EndIf  bufId$=StringField(buf1$,1,",")  P(bufId$)\Patient\Name$=StringField(buf1$,2,",")  AddElement(ID$()) : ID$()=bufId$ Wend CloseFile(1) If ReadFile(2,"./Data/visits.csv")=0 : End 2 : EndIf header=1 While Not Eof(2)  buf1$=ReadString(2)  If header=1 : header=0 : Continue : EndIf   bufId$=StringField(buf1$,1,",")  AddElement(P(bufId$)\PVisit())   P(bufId$)\PVisit()\Datum$=StringField(buf1$,2,",")  P(bufId$)\PVisit()\Score$=StringField(buf1$,3,",") Wend CloseFile(2) If OpenConsole()=0 : End 3 : EndIf SortList(ID$(),#PB_Sort_Ascending) PrintN("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |") ForEach ID$()  Print("| "+LSet(ID$(),11))  Print("| "+LSet(P(ID$())\Patient\Name$,9)+"|")  SortStructuredList(P(ID$())\PVisit(),#PB_Sort_Ascending,OffsetOf(Visits\Datum$),TypeOf(Visits\Datum$))   ForEach P(ID$())\PVisit()   scs.f+ValF(p(ID$())\PVisit()\Score$) : c+Bool(ValF(p(ID$())\PVisit()\Score$))  Next  If LastElement(P(ID$())\PVisit())   sca.f=scs/c   Print(" "+LSet(P(ID$())\PVisit()\Datum$,10)+" |")  Print(RSet(StrF(scs,1),10)+" |")  If Not IsNAN(sca) : Print(RSet(StrF(sca,2),10)+" |") : Else : Print(Space(11)+"|") : EndIf  Else  Print(Space(12)+"|"+Space(11)+"|"+Space(11)+"|")  EndIf   PrintN("") : scs=0 : c=0 Next Input() 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | 0.0 | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |

Python: Using pandas library

# to install pandas library go to cmd prompt and type: # cd %USERPROFILE%\AppData\Local\Programs\Python\Python38-32\Scripts\ # pip install pandas import pandas as pd # load data from csv files df_patients = pd.read_csv (r'patients.csv', sep = ",", decimal=".") df_visits = pd.read_csv (r'visits.csv', sep = ",", decimal=".") ''' # load data hard coded, create data frames import io str_patients = """PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz """ df_patients = pd.read_csv(io.StringIO(str_patients), sep = ",", decimal=".") str_visits = """PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 """ df_visits = pd.read_csv(io.StringIO(str_visits), sep = ",", decimal=".") ''' # typecast from string to datetime so .agg can 'max' it df_visits['VISIT_DATE'] = pd.to_datetime(df_visits['VISIT_DATE']) # merge on PATIENT_ID df_merge = df_patients.merge(df_visits, on='PATIENT_ID', how='left') # groupby is an intermediate object df_group = df_merge.groupby(['PATIENT_ID','LASTNAME'], as_index=False) # note: you can use 'sum' instead of the lambda function but that returns NaN as 0 (zero) df_result = df_group.agg({'VISIT_DATE': 'max', 'SCORE': [lambda x: x.sum(min_count=1),'mean']}) print(df_result) 
 PATIENT_ID LASTNAME LAST_VISIT SCORE max <lambda_0> mean 0 1001 Hopper 2020-11-19 17.4 5.8 1 2002 Gosling 2020-10-08 6.8 6.8 2 3003 Kemeny 2020-11-12 NaN NaN 3 4004 Wirth 2020-11-05 15.4 7.7 4 5005 Kurtz NaT NaN NaN 

Python: Stdlib csv only

Using only standard libraries and input from csv files.

import csv fnames = 'patients.csv patients_visits.csv'.split() def csv2list(fname): with open(fname) as f: rows = list(csv.reader(f)) return rows patients, visits = data = [csv2list(fname) for fname in fnames] result = [record.copy() for record in patients] result[1:] = sorted(result[1:]) #%% result[0].append('LAST_VISIT') last = {p: vis for p, vis, *score in visits[1:]} for record in result[1:]: p = record[0] record.append(last.get(p, '')) #%% result[0] += ['SCORE_SUM', 'SCORE_AVG'] n = {p: 0 for p, *_ in patients[1:]} tot = n.copy() for record in visits[1:]: p, _, score = record if score: n[p] += 1 tot[p] += float(score) for record in result[1:]: p = record[0] if n[p]: record += [f"{tot[p]:5.1f}", f"{tot[p] / n[p]:5.2f}"] else: record += ['', ''] #%% for record in result: print(f"| {' | '.join(f'{r:^10}' for r in record)} |") 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |


Python: Stdlib sqlite3 and csv only

Using the csv module just to parse inputs; and the sqlite3 module, (which which is also a standard library that comes with the base Python install), to calculate the output.
(The SQL SELECT statement is modelled on that of the SQL entry elsewhere on this page).

import sqlite3 import csv fnames = 'patients.csv patients_visits.csv'.split() conn = sqlite3.connect(":memory:") #%% def create_table_headers(conn): curs = conn.cursor() curs.execute('''  CREATE TABLE patients(PATIENT_ID INT, LASTNAME TEXT);  ''') curs.execute('''  CREATE TABLE patients_visits(PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1));  ''') conn.commit() def fill_tables(conn, fnames): curs = conn.cursor() for fname in fnames: with open(fname) as f: tablename = fname.replace('.csv', '') # csvdata = csv.reader(f) header = next(csvdata) fields = ','.join('?' for _ in header) for row in csvdata: row = [(None if r == '' else r) for r in row] curs.execute(f"INSERT INTO {tablename} VALUES ({fields});", row) conn.commit() def join_tables_and_group(conn): curs = conn.cursor() curs.execute(''' CREATE TABLE answer AS  SELECT 	patients.PATIENT_ID, 	patients.LASTNAME, 	MAX(VISIT_DATE) AS LAST_VISIT, 	SUM(SCORE) AS SCORE_SUM, 	CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG  FROM 	patients 	LEFT JOIN patients_visits 	ON patients_visits.PATIENT_ID = patients.PATIENT_ID  GROUP BY 	patients.PATIENT_ID, 	patients.LASTNAME  ORDER BY 	patients.PATIENT_ID;  ''') curs.execute(''' SELECT * FROM answer;  ''') conn.commit() rows = list(curs.fetchall()) headers = tuple(d[0] for d in curs.description) return [headers] + rows create_table_headers(conn) fill_tables(conn, fnames) result = join_tables_and_group(conn) for record in result: print(f"| {' | '.join(f'{str(r):^10}' for r in record)} |") 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | None | None | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | None | None | None |

R

# load data from csv files # setwd("C:\Temp\csv\") # df_patient <- read.csv(file="patients.csv", header = TRUE, sep = ",") # df_visits <- read.csv(file="visits.csv", header = TRUE, sep = ",", dec = ".", colClasses=c("character","character","numeric")) # load data hard coded, create data frames df_patient <- read.table(text = " PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz ", header = TRUE, sep = ",") # character fields so no need for extra parameters colClasses etc. df_visits <- read.table(text = " PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 ", header = TRUE, dec = ".", sep = ",", colClasses=c("character","character","numeric")) # aggregate visit date and scores df_agg <- data.frame(  cbind(  PATIENT_ID = names(tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE)),  last_visit = tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE),  score_sum = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), sum, na.rm=TRUE),  score_avg = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), mean, na.rm=TRUE)  ) )  # merge patients and aggregate dataset # all.x = all the non matching cases of df_patient are appended to the result as well (i.e. 'left join') df_result <- merge(df_patient, df_agg, by = 'PATIENT_ID', all.x = TRUE) print(df_result) 
Output:
 PATIENT_ID LASTNAME last_visit score_sum score_avg 1 1001 Hopper 2020-11-19 17.4 5.8 2 2002 Gosling 2020-10-08 6.8 6.8 3 3003 Kemeny 2020-11-12 0 NaN 4 4004 Wirth 2020-11-05 15.4 7.7 5 5005 Kurtz <NA> <NA> <NA> 
my @names = map { ( <PATIENT_ID LASTNAME> Z=> .list ).hash },  ( 1001, 'Hopper' ),  ( 4004, 'Wirth' ),  ( 3003, 'Kemeny' ),  ( 2002, 'Gosling' ),  ( 5005, 'Kurtz' ), ; my @visits = map { ( <PATIENT_ID VISIT_DATE SCORE> Z=> .list ).hash },  ( 2002, '2020-09-10', 6.8 ),  ( 1001, '2020-09-17', 5.5 ),  ( 4004, '2020-09-24', 8.4 ),  ( 2002, '2020-10-08', Nil ),  ( 1001, Nil , 6.6 ),  ( 3003, '2020-11-12', Nil ),  ( 4004, '2020-11-05', 7.0 ),  ( 1001, '2020-11-19', 5.3 ), ;  my %v = @visits.classify: *.<PATIENT_ID>;  my @result = gather for @names -> %n {  my @p = %v{ %n.<PATIENT_ID> }<>;   my @dates = @p».<VISIT_DATE>.grep: *.defined;  my @scores = @p».< SCORE>.grep: *.defined;   take {  %n,  LAST_VISIT => ( @dates.max if @dates ),  SCORE_AVG => ( @scores.sum/@scores if @scores ),  SCORE_SUM => ( @scores.sum if @scores ),  }; }  my @out_field_names = <PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG>; my @rows = @result.sort(*.<PATIENT_ID>).map(*.{@out_field_names}); say .map({$_ // ''}).fmt('%-10s', ' | ') for @out_field_names, |@rows; 
Output:
PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 3003 | Kemeny | 2020-11-12 | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 5005 | Kurtz | | | 
/* REXX */ patients='patients.csv' l=linein(patients) Parse Var l h1 ',' h2 n=0 idl='' Do n=1 By 1 While lines(patients)>0  l=linein(patients)  Parse Var l id ',' lastname.id  idl=idl id  End n=n-1 /* number of patients */ visits='visits.csv' l=linein(visits) /* skip the header line of this file */  h3='LAST_VISIT' h4='SCORE_SUM' h5='SCORE_AVG' date.='' score.=0 Say '|' h1 '|' h2 '|' h3 '|' h4 '|' h5 '|' Do While lines(visits)>0  l=linein(visits)  Parse Var l id ',' date ',' score  if date>date.id Then date.id=date  If score>'' Then Do  z=score.id.0+1  score.id.z=score  score.id.0=z  End  end idl=wordsort(idl) Do While idl<>''  Parse Var idl id idl  If date.id='' Then date.id=copies(' ',10)  ol='|' left(id,length(h1)) '|' left(lastname.id,length(h2)),  '|' left(date.id,length(h3))  If score.id.0=0 Then Do  ol=ol '|' left(' ',length(h4)) '|',  left(' ',length(h5)) '|'  score_sum=copies(' ',length(h4))  score_avg=copies(' ',length(h4))  End  Else Do  score_sum=0  Do j=1 To score.id.0  score_sum=score_sum+score.id.j  End  score_avg=score_sum/score.id.0  ol=ol '|' left(format(score_sum,2,1),length(h4)) '|',  left(format(score_avg,2,1),length(h5)) '|'  End  Say ol  End Exit wordsort: Procedure /********************************************************************** * Sort the list of words supplied as argument. Return the sorted list **********************************************************************/  Parse Arg wl  wa.=''  wa.0=0  Do While wl<>''  Parse Var wl w wl  Do i=1 To wa.0  If wa.i>w Then Leave  End  If i<=wa.0 Then Do  Do j=wa.0 To i By -1  ii=j+1  wa.ii=wa.j  End  End  wa.i=w  wa.0=wa.0+1  End  swl=''  Do i=1 To wa.0  swl=swl wa.i  End  Return strip(swl) 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | | 5005 | Kurtz | | | |
 %let datefmt=E8601DA10.;  data patient; infile "patient.csv" dsd dlm=','; attrib id length=4 lastname length=$10; input id lastname;  data visit; infile "visit.csv" dsd dlm=','; attrib id length=4 date informat=&datefmt format=&datefmt score length=8; input id date score;  proc sql; select * from (select id, max(date) format=&datefmt as max_date, sum(score) as sum_score,	avg(score) as avg_score from visit group by id) natural right join patient order by id; 
Output:
: id lastname max_date sum_score avg_score : ------------------------------------------------------ : 1001 Hopper 2020-11-19 17.4 5.8 : 2002 Gosling 2020-10-08 6.8 6.8 : 3003 Kemeny 2020-11-12 . . : 4004 Wirth 2020-11-05 15.4 7.7 : 5005 Kurtz . . .

Note: the PATIENT_ID 5005 is missing from the result, because MATCH FILES cannot add rows for missing keys, i.e. AFAIK a "left join" is not possible is SPSS.

* set working directory to location of .csv files CD 'C:\Temp\csv\'. * load patients csv data GET DATA /TYPE=TXT /FILE="patients.csv" /ENCODING='UTF8' /DELCASE=LINE /DELIMITERS="," /QUALIFIER='"' /ARRANGEMENT=DELIMITED /FIRSTCASE=2 /IMPORTCASE=ALL /VARIABLES= PATIENT_ID F5.0 LASTNAME A20 . CACHE. EXECUTE. * sort cases is needed to match files SORT CASES BY PATIENT_ID (A). DATASET NAME Patients WINDOW=FRONT. * load visits csv data GET DATA /TYPE=TXT /FILE="visit.csv" /ENCODING='UTF8' /DELCASE=LINE /DELIMITERS="," /QUALIFIER='"' /ARRANGEMENT=DELIMITED /FIRSTCASE=2 /IMPORTCASE=ALL /VARIABLES= PATIENT_ID F5.0 VISIT_DATE SDATE10 SCORE F4.1 . CACHE. EXECUTE. * sort cases is needed, else match files will raise error "Files out of order" SORT CASES BY PATIENT_ID (A) VISIT_DATE (A). DATASET NAME Visits WINDOW=FRONT. * load visits csv data * merge datasets, one to many, FILE is the 'one', TABLE is 'many' MATCH FILES TABLE = Patients / FILE = Visits /BY PATIENT_ID. EXECUTE. * aggregate visit date and scores, group by and order (A)=ascending or (D)=descending AGGREGATE OUTFILE * /BREAK=PATIENT_ID(A) /last_visit = MAX(VISIT_DATE) /score_avg = MEAN(SCORE) /score_sum = SUM(SCORE).
Output:
| PATIENT_ID | LASTNAME | last_visit | score_sum | score_avg | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.8 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.8 | | 3003 | Kemeny | 2020-11-12 | . | . | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.7 | 
-- drop tables DROP TABLE IF EXISTS tmp_patients; DROP TABLE IF EXISTS tmp_visits; -- create tables CREATE TABLE tmp_patients( PATIENT_ID INT, LASTNAME VARCHAR(20) ); CREATE TABLE tmp_visits( PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1) ); -- load data from csv files /* -- Note: LOAD DATA LOCAL requires `local-infile` enabled on both the client and server else you get error "#1148 command is not allowed.." LOAD DATA LOCAL INFILE '/home/csv/patients.csv' INTO TABLE `tmp_patients` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES; LOAD DATA LOCAL INFILE '/home/csv/visits.csv' INTO TABLE `tmp_visits` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES; */ -- load data hard coded INSERT INTO tmp_patients(PATIENT_ID, LASTNAME) VALUES (1001, 'Hopper'), (4004, 'Wirth'), (3003, 'Kemeny'), (2002, 'Gosling'), (5005, 'Kurtz'); INSERT INTO tmp_visits(PATIENT_ID, VISIT_DATE, SCORE) VALUES (2002, '2020-09-10', 6.8), (1001, '2020-09-17', 5.5), (4004, '2020-09-24', 8.4), (2002, '2020-10-08', NULL), (1001, NULL, 6.6), (3003, '2020-11-12', NULL), (4004, '2020-11-05', 7.0), (1001, '2020-11-19', 5.3); -- join tables and group SELECT p.PATIENT_ID, p.LASTNAME, MAX(VISIT_DATE) AS LAST_VISIT, SUM(SCORE) AS SCORE_SUM, CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG FROM tmp_patients p LEFT JOIN tmp_visits v ON v.PATIENT_ID = p.PATIENT_ID GROUP BY p.PATIENT_ID, p.LASTNAME ORDER BY p.PATIENT_ID; 
Output:
PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG 1001 Hopper 2020-11-19 17.4 5.80 2002 Gosling 2020-10-08 6.8 6.80 3003 Kemeny 2020-11-12 NULL NULL 4004 Wirth 2020-11-05 15.4 7.70 5005 Kurtz NULL NULL NULL 
#lang transd MainModule: { tbl: `1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz`, tbl1: ` 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3`, cols: `@key_PATIENT_ID:Int,  LASTNAME:String,  VISIT_DATE:DateTime,  SCORE:Double,  SCORE_AVG:Double,  NUM_VISITS:Int`, Record : typealias(Tuple<Int DateTime Double>()), _start: (λ (with base TSDBase() (load-table base tbl colNames: cols)  (build-index base "PATIENT_ID")  (with vizs Vector<Record>()   (load-table vizs tbl1 :mixedTypes fieldSep: "," rowSep: "\n" )  (for viz in vizs do  (tsd-query base   :update set:  (lambda PATIENT_ID Int() VISIT_DATE DateTime()   SCORE Double() SCORE_AVG Double() NUM_VISITS Int()  (+= NUM_VISITS 1)  (set VISIT_DATE (get viz 1))   (set SCORE (+ SCORE (get viz 2)))  (set SCORE_AVG (/ SCORE NUM_VISITS)))  where: (lambda PATIENT_ID Int() (eq PATIENT_ID (get viz 0))))  ))    (with cols ["PATIENT_ID","LASTNAME","VISIT_DATE","SCORE","SCORE_AVG"]  (with recs (tsd-query base select: cols   as: [[Int(), String(), DateTime(), Double(), Double()]]  where: (lambda PATIENT_ID Int() true)  )  (for i in cols do (textout width: 10 i "|")) (lout "")  (for rec in recs do   (for-each rec (λ i :Data() (textout width: 10 i "|" )))  (lout "")) )) )) } 
Output:
PATIENT_ID| LASTNAME|VISIT_DATE| SCORE| SCORE_AVG| 1001| Hopper|2020-11-19| 17.4| 5.8| 4004| Wirth| 2020-11-5| 15.4| 7.7| 3003| Kemeny|2020-11-12| 0| 0| 2002| Gosling| 2020-10-8| 6.8| 3.4| 5005| Kurtz| | 0| 0| 
BEGIN; TYPE Date UNION; TYPE DateValid IS {Date POSSREP {year INTEGER, month INTEGER, day INTEGER}}; TYPE DateNone IS {Date POSSREP {}}; TYPE DateUnknown IS {Date POSSREP {}}; END; VAR patient REAL RELATION {id INT, lastname CHAR} KEY {id}; INSERT patient RELATION {TUPLE {id 1001, lastname 'Hopper'}, TUPLE {id 4004, lastname 'Wirth'}, TUPLE {id 3003, lastname 'Kemeny'}, TUPLE {id 2002, lastname 'Gosling'}, TUPLE {id 5005, lastname 'Kurtz'} }; VAR visit REAL RELATION {id INT, date Date, score RATIONAL} KEY {id, date}; INSERT visit RELATION { TUPLE {id 2002, date DateValid(2020,09,10), score 6.8}, TUPLE {id 1001, date DateValid(2020,09,17), score 5.5}, TUPLE {id 4004, date DateValid(2020,09,24), score 8.4}, TUPLE {id 2002, date DateValid(2020,10,08), score NAN}, TUPLE {id 1001, date DateNone(), score 6.6}, TUPLE {id 3003, date DateValid(2020,11,12), score NAN}, TUPLE {id 4004, date DateValid(2020,11,05), score 7.0}, TUPLE {id 1001, date DateValid(2020,11,19), score 5.3} }; ((SUMMARIZE (visit WHERE score>0.0) BY {id}: {sumscore := SUM(score), avgscore := AVG(score)}) UNION (EXTEND (patient {id} MINUS ((visit WHERE score>0.0) {id})): {sumscore := NaN, avgscore := NaN})) JOIN (SUMMARIZE visit BY {id}: {maxdate := MAX(date)} UNION (EXTEND (patient {id} MINUS (visit {id})): {maxdate := DateUnknown()})) JOIN patient 
Output:
id	sumscore	avgscore	maxdate	lastname 1001	17.4	5.8	DateValid(2020, 11, 19)	Hopper 2002	6.8	6.8	DateValid(2020, 10, 8)	Gosling 4004	15.4	7.7	DateValid(2020, 11, 5)	Wirth 3003	NaN	NaN	DateValid(2020, 11, 12)	Kemeny 5005	NaN	NaN	DateUnknown()	Kurtz
Library: Wren-sort
Library: Wren-fmt
import "./fmt" for Fmt import "./sort" for Sort class Patient {  construct new(id, lastName) {  _id = id  _lastName = lastName  if (!__dir) __dir = {}  __dir[id] = lastName  if (!__ids) {  __ids = [id]  } else {  __ids.add(id)  Sort.insertion(__ids)  }  }  id { _id }  lastName { _lastName }  // maps an id to a lastname  static dir { __dir }  // maintains a sorted list of ids  static ids { __ids } } class Visit {  construct new(id, date, score) {  _id = id  _date = date || "0000-00-00"  _score = score  if (!__dir) __dir = {}  if (!__dir[id]) {  __dir[id] = [ [_date], [score] ]  } else {  __dir[id][0].add(_date)  __dir[id][1].add(score)  }  }  id { _id }  date { _date }  score { _score }  // maps an id to lists of dates and scores  static dir { __dir } } class Merge {  construct new(id) {  _id = id  }  id { _id }  lastName { Patient.dir[_id] }  dates { Visit.dir[_id][0] }  scores { Visit.dir[_id][1] }  lastVisit { Sort.merge(dates)[-1] }  scoreSum { scores.reduce(0) { |acc, s| s ? acc + s : acc } }  scoreAvg { scoreSum / scores.count { |s| s } }  static print(merges) {  System.print("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |")  var fmt = "| $d | $-7s | $s | $4s | $4s |"  for (m in merges) {  if (Visit.dir[m.id]) {  var lv = (m.lastVisit != "0000-00-00") ? m.lastVisit : " "  var ss = (m.scoreSum > 0) ? Fmt.f(4, m.scoreSum, 1) : " "  var sa = (!m.scoreAvg.isNan) ? Fmt.f(4, m.scoreAvg, 2) : " "   Fmt.print(fmt, m.id, m.lastName, lv, ss, sa)  } else {  Fmt.print(fmt, m.id, m.lastName, " ", " ", " ")  }  }  } } Patient.new(1001, "Hopper") Patient.new(4004, "Wirth") Patient.new(3003, "Kemeny") Patient.new(2002, "Gosling") Patient.new(5005, "Kurtz") Visit.new(2002, "2020-09-10", 6.8) Visit.new(1001, "2020-09-17", 5.5) Visit.new(4004, "2020-09-24", 8.4) Visit.new(2002, "2020-10-08", null) Visit.new(1001, null , 6.6) Visit.new(3003, "2020-11-12", null) Visit.new(4004, "2020-11-05", 7.0) Visit.new(1001, "2020-11-19", 5.3) var merges = Patient.ids.map { |id| Merge.new(id) }.toList Merge.print(merges) 
Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG | | 1001 | Hopper | 2020-11-19 | 17.4 | 5.80 | | 2002 | Gosling | 2020-10-08 | 6.8 | 6.80 | | 3003 | Kemeny | 2020-11-12 | | | | 4004 | Wirth | 2020-11-05 | 15.4 | 7.70 | | 5005 | Kurtz | | | |