Skip to content

Commit 94b202b

Browse files
authored
Merge pull request #608 from MarkSFrancis/shuf
Added shuf, a way to get a random sample from a large dataset
2 parents 37ef611 + 4f348cc commit 94b202b

File tree

2 files changed

+97
-0
lines changed

2 files changed

+97
-0
lines changed

DIRECTORY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575
* [MinimumCostPath](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/MinimumCostPath.js)
7676
* [NumberOfSubsetEqualToGivenSum](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/NumberOfSubsetEqualToGivenSum.js)
7777
* [SieveOfEratosthenes](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/SieveOfEratosthenes.js)
78+
* [Shuf](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/Shuf.js)
7879
* [SudokuSolver](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/SudokuSolver.js)
7980
* [TrappingRainWater](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/TrappingRainWater.js)
8081
* [ZeroOneKnapsack](https://github.com/TheAlgorithms/Javascript/blob/master/Dynamic-Programming/ZeroOneKnapsack.js)

Dynamic-Programming/Shuf.js

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/**
 * Given a data set of an unknown size, get a random sample in a random order.
 *
 * It's used in data analytics, often as a way to get a small random sample
 * from a data lake or warehouse, or from a large CSV file.
 *
 * The source is consumed in two passes over the same iterator: first
 * fillBaseSample reads items to build the initial sample, then
 * randomizeOutputFromDataset reads whatever is left and randomly swaps
 * later items into that sample.
 *
 * @param {Iterator.<T>} datasetSource The data source; it is read by calling its next() method
 * @param {number} sampleSize The size of the sample to extract from the dataset
 * @returns {Array.<T>} The random sample, as an array
 * @template T
 */
function shuf (datasetSource, sampleSize) {
  return randomizeOutputFromDataset(
    datasetSource,
    fillBaseSample(datasetSource, sampleSize)
  )
}
11+
12+
/**
 * Builds the initial sample by reading at most `sampleSize` items from the
 * source and placing them in a random order.
 *
 * Reading stops as soon as the sample is full, so no item beyond the first
 * `sampleSize` is consumed (and nothing at all is consumed when
 * `sampleSize` is 0). If the source runs out first, the returned array is
 * simply shorter than `sampleSize`.
 *
 * @param {Iterator.<T>} datasetSource The data source; it is read by calling its next() method
 * @param {number} sampleSize The size of the sample to extract from the dataset (non-negative integer)
 * @returns {Array.<T>} The items read so far, in random order
 * @throws {RangeError} If sampleSize is not a non-negative integer
 * @template T
 */
function fillBaseSample (datasetSource, sampleSize) {
  if (!Number.isInteger(sampleSize) || sampleSize < 0) {
    throw new RangeError('sampleSize must be a non-negative integer')
  }

  const output = []

  // Checking the size BEFORE reading keeps us from consuming an item we
  // cannot store (important for sampleSize === 0)
  while (output.length < sampleSize) {
    const iterator = datasetSource.next()
    if (iterator.done) break

    // Inside-out Fisher-Yates shuffle: pick a slot in [0, length]; the item
    // currently there moves to the end and the new item takes its place.
    // Each insert is O(1) and, given a uniform Math.random, every ordering
    // of the items read so far is equally likely.
    const insertTo = Math.floor(Math.random() * (output.length + 1))
    if (insertTo === output.length) {
      output.push(iterator.value)
    } else {
      output.push(output[insertTo])
      output[insertTo] = iterator.value
    }
  }

  return output
}
53+
54+
/**
 * Reads the rest of the source and randomly overwrites entries of the
 * sample with the newly read items.
 *
 * A running count starts at the size of the given sample and grows by one
 * per item read. For each item a random index below that count is drawn;
 * the item replaces the sample entry at that index only when the index
 * falls inside the sample, otherwise the item is discarded. The array passed
 * in is left untouched; a copy is modified and returned.
 *
 * @param {Iterator.<T>} datasetSource The data source; it is read by calling its next() method until done
 * @param {Array.<T>} output The sample gathered so far
 * @returns {Array.<T>} A new array holding the updated sample
 * @template T
 */
function randomizeOutputFromDataset (datasetSource, output) {
  const reservoir = Array.from(output)
  let seenCount = output.length

  for (let step = datasetSource.next(); !step.done; step = datasetSource.next()) {
    seenCount += 1

    const slot = Math.floor(Math.random() * seenCount)
    // An index past the end of the sample means this item is not kept
    if (slot >= reservoir.length) continue

    reservoir[slot] = step.value
  }

  return reservoir
}
78+
79+
// Demo: draws a sample of 10 values from a stream of 1000 random numbers
// and prints it
const main = () => {
  /**
   * Lazily yields `length` random integers, each computed as
   * Math.floor(Math.random() * (2^31 - 1))
   * @param {number} length The number of data items to generate
   * @returns {Generator<number>} Random data, produced one item at a time
   */
  const generateRandomData = function * (length) {
    const upperBound = 2 ** 31 - 1
    let produced = 0
    while (produced < length) {
      yield Math.floor(Math.random() * upperBound)
      produced += 1
    }
  }

  const result = shuf(generateRandomData(1000), 10)
  console.log(result)
}
main()

0 commit comments

Comments
 (0)