| // This module implements a Zipfian distribution generator. |
| // |
| // Based on https://github.com/jonhoo/rust-zipf. |
| |
| use rand::Rng; |
| |
| /// Random number generator that generates Zipf-distributed random numbers using rejection |
| /// inversion. |
| #[derive(Clone, Copy)] |
| pub struct ZipfDistribution { |
| /// Number of elements |
| num_elements: f64, |
| /// Exponent parameter of the distribution |
| exponent: f64, |
| /// `hIntegral(1.5) - 1}` |
| h_integral_x1: f64, |
| /// `hIntegral(num_elements + 0.5)}` |
| h_integral_num_elements: f64, |
| /// `2 - hIntegralInverse(hIntegral(2.5) - h(2)}` |
| s: f64, |
| } |
| |
| impl ZipfDistribution { |
| /// Creates a new [Zipf-distributed](https://en.wikipedia.org/wiki/Zipf's_law) |
| /// random number generator. |
| /// |
| /// Note that both the number of elements and the exponent must be greater than 0. |
| pub fn new(num_elements: usize, exponent: f64) -> Result<Self, ()> { |
| if num_elements == 0 { |
| return Err(()); |
| } |
| if exponent <= 0f64 { |
| return Err(()); |
| } |
| |
| let z = ZipfDistribution { |
| num_elements: num_elements as f64, |
| exponent, |
| h_integral_x1: ZipfDistribution::h_integral(1.5, exponent) - 1f64, |
| h_integral_num_elements: ZipfDistribution::h_integral( |
| num_elements as f64 + 0.5, |
| exponent, |
| ), |
| s: 2f64 |
| - ZipfDistribution::h_integral_inv( |
| ZipfDistribution::h_integral(2.5, exponent) |
| - ZipfDistribution::h(2f64, exponent), |
| exponent, |
| ), |
| }; |
| |
| // populate cache |
| |
| Ok(z) |
| } |
| } |
| |
| impl ZipfDistribution { |
| fn next<R: Rng + ?Sized>(&self, rng: &mut R) -> usize { |
| // The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI). |
| // |
| // The original method uses |
| // H(x) = (v + x)^(1 - q) / (1 - q) |
| // as the integral of the hat function. |
| // |
| // This function is undefined for q = 1, which is the reason for the limitation of the |
| // exponent. |
| // |
| // If instead the integral function |
| // H(x) = ((v + x)^(1 - q) - 1) / (1 - q) |
| // is used, for which a meaningful limit exists for q = 1, the method works for all |
| // positive exponents. |
| // |
| // The following implementation uses v = 0 and generates integral number in the range [1, |
| // num_elements]. This is different to the original method where v is defined to |
| // be positive and numbers are taken from [0, i_max]. This explains why the implementation |
| // looks slightly different. |
| |
| let hnum = self.h_integral_num_elements; |
| |
| loop { |
| use std::cmp; |
| let u: f64 = hnum + rng.random::<f64>() * (self.h_integral_x1 - hnum); |
| // u is uniformly distributed in (h_integral_x1, h_integral_num_elements] |
| |
| let x: f64 = ZipfDistribution::h_integral_inv(u, self.exponent); |
| |
| // Limit k to the range [1, num_elements] if it would be outside |
| // due to numerical inaccuracies. |
| let k64 = x.max(1.0).min(self.num_elements); |
| // float -> integer rounds towards zero, so we add 0.5 |
| // to prevent bias towards k == 1 |
| let k = cmp::max(1, (k64 + 0.5) as usize); |
| |
| // Here, the distribution of k is given by: |
| // |
| // P(k = 1) = C * (hIntegral(1.5) - h_integral_x1) = C |
| // P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2 |
| // |
| // where C = 1 / (h_integral_num_elements - h_integral_x1) |
| if k64 - x <= self.s |
| || u >= ZipfDistribution::h_integral(k64 + 0.5, self.exponent) |
| - ZipfDistribution::h(k64, self.exponent) |
| { |
| // Case k = 1: |
| // |
| // The right inequality is always true, because replacing k by 1 gives |
| // u >= hIntegral(1.5) - h(1) = h_integral_x1 and u is taken from |
| // (h_integral_x1, h_integral_num_elements]. |
| // |
| // Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1 |
| // and the probability that 1 is returned as random value is |
| // P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent |
| // |
| // Case k >= 2: |
| // |
| // The left inequality (k - x <= s) is just a short cut |
| // to avoid the more expensive evaluation of the right inequality |
| // (u >= hIntegral(k + 0.5) - h(k)) in many cases. |
| // |
| // If the left inequality is true, the right inequality is also true: |
| // Theorem 2 in the paper is valid for all positive exponents, because |
| // the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and |
| // (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0 |
| // are both fulfilled. |
| // Therefore, f(x) = x - hIntegralInverse(hIntegral(x + 0.5) - h(x)) |
| // is a non-decreasing function. If k - x <= s holds, |
| // k - x <= s + f(k) - f(2) is obviously also true which is equivalent to |
| // -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), |
| // -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), |
| // and finally u >= hIntegral(k + 0.5) - h(k). |
| // |
| // Hence, the right inequality determines the acceptance rate: |
| // P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2)) |
| // The probability that m is returned is given by |
| // P(k = m and accepted) = P(accepted | k = m) * P(k = m) |
| // = C * h(m) = C / m^exponent. |
| // |
| // In both cases the probabilities are proportional to the probability mass |
| // function of the Zipf distribution. |
| |
| return k; |
| } |
| } |
| } |
| } |
| |
| impl rand::distr::Distribution<usize> for ZipfDistribution { |
| fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> usize { |
| self.next(rng) |
| } |
| } |
| |
| use std::fmt; |
| impl fmt::Debug for ZipfDistribution { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { |
| f.debug_struct("ZipfDistribution") |
| .field("e", &self.exponent) |
| .field("n", &self.num_elements) |
| .finish() |
| } |
| } |
| |
| impl ZipfDistribution { |
| /// Computes `H(x)`, defined as |
| /// |
| /// - `(x^(1 - exponent) - 1) / (1 - exponent)`, if `exponent != 1` |
| /// - `log(x)`, if `exponent == 1` |
| /// |
| /// `H(x)` is an integral function of `h(x)`, the derivative of `H(x)` is `h(x)`. |
| fn h_integral(x: f64, exponent: f64) -> f64 { |
| let log_x = x.ln(); |
| helper2((1f64 - exponent) * log_x) * log_x |
| } |
| |
| /// Computes `h(x) = 1 / x^exponent` |
| fn h(x: f64, exponent: f64) -> f64 { |
| (-exponent * x.ln()).exp() |
| } |
| |
| /// The inverse function of `H(x)`. |
| /// Returns the `y` for which `H(y) = x`. |
| fn h_integral_inv(x: f64, exponent: f64) -> f64 { |
| let mut t: f64 = x * (1f64 - exponent); |
| if t < -1f64 { |
| // Limit value to the range [-1, +inf). |
| // t could be smaller than -1 in some rare cases due to numerical errors. |
| t = -1f64; |
| } |
| (helper1(t) * x).exp() |
| } |
| } |
| |
| /// Helper function that calculates `log(1 + x) / x`. |
| /// A Taylor series expansion is used, if x is close to 0. |
| fn helper1(x: f64) -> f64 { |
| if x.abs() > 1e-8 { x.ln_1p() / x } else { 1f64 - x * (0.5 - x * (1.0 / 3.0 - 0.25 * x)) } |
| } |
| |
| /// Helper function to calculate `(exp(x) - 1) / x`. |
| /// A Taylor series expansion is used, if x is close to 0. |
| fn helper2(x: f64) -> f64 { |
| if x.abs() > 1e-8 { |
| x.exp_m1() / x |
| } else { |
| 1f64 + x * 0.5 * (1f64 + x * 1.0 / 3.0 * (1f64 + 0.25 * x)) |
| } |
| } |