univariate_statistics.hpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. // (C) Copyright Nick Thompson 2018.
  2. // Use, modification and distribution are subject to the
  3. // Boost Software License, Version 1.0. (See accompanying file
  4. // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #ifndef BOOST_MATH_STATISTICS_UNIVARIATE_STATISTICS_HPP
  6. #define BOOST_MATH_STATISTICS_UNIVARIATE_STATISTICS_HPP
  7. #include <algorithm>
  8. #include <iterator>
  9. #include <tuple>
  10. #include <cmath>
  11. #include <boost/assert.hpp>
  12. namespace boost::math::statistics {
  13. template<class ForwardIterator>
  14. auto mean(ForwardIterator first, ForwardIterator last)
  15. {
  16. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  17. BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the mean.");
  18. if constexpr (std::is_integral<Real>::value)
  19. {
  20. double mu = 0;
  21. double i = 1;
  22. for(auto it = first; it != last; ++it) {
  23. mu = mu + (*it - mu)/i;
  24. i += 1;
  25. }
  26. return mu;
  27. }
  28. else if constexpr (std::is_same_v<typename std::iterator_traits<ForwardIterator>::iterator_category, std::random_access_iterator_tag>)
  29. {
  30. size_t elements = std::distance(first, last);
  31. Real mu0 = 0;
  32. Real mu1 = 0;
  33. Real mu2 = 0;
  34. Real mu3 = 0;
  35. Real i = 1;
  36. auto end = last - (elements % 4);
  37. for(auto it = first; it != end; it += 4) {
  38. Real inv = Real(1)/i;
  39. Real tmp0 = (*it - mu0);
  40. Real tmp1 = (*(it+1) - mu1);
  41. Real tmp2 = (*(it+2) - mu2);
  42. Real tmp3 = (*(it+3) - mu3);
  43. // please generate a vectorized fma here
  44. mu0 += tmp0*inv;
  45. mu1 += tmp1*inv;
  46. mu2 += tmp2*inv;
  47. mu3 += tmp3*inv;
  48. i += 1;
  49. }
  50. Real num1 = Real(elements - (elements %4))/Real(4);
  51. Real num2 = num1 + Real(elements % 4);
  52. for (auto it = end; it != last; ++it)
  53. {
  54. mu3 += (*it-mu3)/i;
  55. i += 1;
  56. }
  57. return (num1*(mu0+mu1+mu2) + num2*mu3)/Real(elements);
  58. }
  59. else
  60. {
  61. auto it = first;
  62. Real mu = *it;
  63. Real i = 2;
  64. while(++it != last)
  65. {
  66. mu += (*it - mu)/i;
  67. i += 1;
  68. }
  69. return mu;
  70. }
  71. }
  72. template<class Container>
  73. inline auto mean(Container const & v)
  74. {
  75. return mean(v.cbegin(), v.cend());
  76. }
  77. template<class ForwardIterator>
  78. auto variance(ForwardIterator first, ForwardIterator last)
  79. {
  80. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  81. BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
  82. // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
  83. if constexpr (std::is_integral<Real>::value)
  84. {
  85. double M = *first;
  86. double Q = 0;
  87. double k = 2;
  88. for (auto it = std::next(first); it != last; ++it)
  89. {
  90. double tmp = *it - M;
  91. Q = Q + ((k-1)*tmp*tmp)/k;
  92. M = M + tmp/k;
  93. k += 1;
  94. }
  95. return Q/(k-1);
  96. }
  97. else
  98. {
  99. Real M = *first;
  100. Real Q = 0;
  101. Real k = 2;
  102. for (auto it = std::next(first); it != last; ++it)
  103. {
  104. Real tmp = (*it - M)/k;
  105. Q += k*(k-1)*tmp*tmp;
  106. M += tmp;
  107. k += 1;
  108. }
  109. return Q/(k-1);
  110. }
  111. }
  112. template<class Container>
  113. inline auto variance(Container const & v)
  114. {
  115. return variance(v.cbegin(), v.cend());
  116. }
  117. template<class ForwardIterator>
  118. auto sample_variance(ForwardIterator first, ForwardIterator last)
  119. {
  120. size_t n = std::distance(first, last);
  121. BOOST_ASSERT_MSG(n > 1, "At least two samples are required to compute the sample variance.");
  122. return n*variance(first, last)/(n-1);
  123. }
  124. template<class Container>
  125. inline auto sample_variance(Container const & v)
  126. {
  127. return sample_variance(v.cbegin(), v.cend());
  128. }
  129. template<class ForwardIterator>
  130. auto mean_and_sample_variance(ForwardIterator first, ForwardIterator last)
  131. {
  132. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  133. BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
  134. // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
  135. if constexpr (std::is_integral<Real>::value)
  136. {
  137. double M = *first;
  138. double Q = 0;
  139. double k = 2;
  140. for (auto it = std::next(first); it != last; ++it)
  141. {
  142. double tmp = *it - M;
  143. Q = Q + ((k-1)*tmp*tmp)/k;
  144. M = M + tmp/k;
  145. k += 1;
  146. }
  147. return std::pair<double, double>{M, Q/(k-2)};
  148. }
  149. else
  150. {
  151. Real M = *first;
  152. Real Q = 0;
  153. Real k = 2;
  154. for (auto it = std::next(first); it != last; ++it)
  155. {
  156. Real tmp = (*it - M)/k;
  157. Q += k*(k-1)*tmp*tmp;
  158. M += tmp;
  159. k += 1;
  160. }
  161. return std::pair<Real, Real>{M, Q/(k-2)};
  162. }
  163. }
  164. template<class Container>
  165. auto mean_and_sample_variance(Container const & v)
  166. {
  167. return mean_and_sample_variance(v.begin(), v.end());
  168. }
  169. // Follows equation 1.5 of:
  170. // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
  171. template<class ForwardIterator>
  172. auto skewness(ForwardIterator first, ForwardIterator last)
  173. {
  174. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  175. using std::sqrt;
  176. BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute skewness.");
  177. if constexpr (std::is_integral<Real>::value)
  178. {
  179. double M1 = *first;
  180. double M2 = 0;
  181. double M3 = 0;
  182. double n = 2;
  183. for (auto it = std::next(first); it != last; ++it)
  184. {
  185. double delta21 = *it - M1;
  186. double tmp = delta21/n;
  187. M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  188. M2 = M2 + tmp*(n-1)*delta21;
  189. M1 = M1 + tmp;
  190. n += 1;
  191. }
  192. double var = M2/(n-1);
  193. if (var == 0)
  194. {
  195. // The limit is technically undefined, but the interpretation here is clear:
  196. // A constant dataset has no skewness.
  197. return double(0);
  198. }
  199. double skew = M3/(M2*sqrt(var));
  200. return skew;
  201. }
  202. else
  203. {
  204. Real M1 = *first;
  205. Real M2 = 0;
  206. Real M3 = 0;
  207. Real n = 2;
  208. for (auto it = std::next(first); it != last; ++it)
  209. {
  210. Real delta21 = *it - M1;
  211. Real tmp = delta21/n;
  212. M3 += tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  213. M2 += tmp*(n-1)*delta21;
  214. M1 += tmp;
  215. n += 1;
  216. }
  217. Real var = M2/(n-1);
  218. if (var == 0)
  219. {
  220. // The limit is technically undefined, but the interpretation here is clear:
  221. // A constant dataset has no skewness.
  222. return Real(0);
  223. }
  224. Real skew = M3/(M2*sqrt(var));
  225. return skew;
  226. }
  227. }
  228. template<class Container>
  229. inline auto skewness(Container const & v)
  230. {
  231. return skewness(v.cbegin(), v.cend());
  232. }
  233. // Follows equation 1.5/1.6 of:
  234. // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
  235. template<class ForwardIterator>
  236. auto first_four_moments(ForwardIterator first, ForwardIterator last)
  237. {
  238. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  239. BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the first four moments.");
  240. if constexpr (std::is_integral<Real>::value)
  241. {
  242. double M1 = *first;
  243. double M2 = 0;
  244. double M3 = 0;
  245. double M4 = 0;
  246. double n = 2;
  247. for (auto it = std::next(first); it != last; ++it)
  248. {
  249. double delta21 = *it - M1;
  250. double tmp = delta21/n;
  251. M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
  252. M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  253. M2 = M2 + tmp*(n-1)*delta21;
  254. M1 = M1 + tmp;
  255. n += 1;
  256. }
  257. return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
  258. }
  259. else
  260. {
  261. Real M1 = *first;
  262. Real M2 = 0;
  263. Real M3 = 0;
  264. Real M4 = 0;
  265. Real n = 2;
  266. for (auto it = std::next(first); it != last; ++it)
  267. {
  268. Real delta21 = *it - M1;
  269. Real tmp = delta21/n;
  270. M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
  271. M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  272. M2 = M2 + tmp*(n-1)*delta21;
  273. M1 = M1 + tmp;
  274. n += 1;
  275. }
  276. return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
  277. }
  278. }
  279. template<class Container>
  280. inline auto first_four_moments(Container const & v)
  281. {
  282. return first_four_moments(v.cbegin(), v.cend());
  283. }
  284. // Follows equation 1.6 of:
  285. // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
  286. template<class ForwardIterator>
  287. auto kurtosis(ForwardIterator first, ForwardIterator last)
  288. {
  289. auto [M1, M2, M3, M4] = first_four_moments(first, last);
  290. if (M2 == 0)
  291. {
  292. return M2;
  293. }
  294. return M4/(M2*M2);
  295. }
  296. template<class Container>
  297. inline auto kurtosis(Container const & v)
  298. {
  299. return kurtosis(v.cbegin(), v.cend());
  300. }
  301. template<class ForwardIterator>
  302. auto excess_kurtosis(ForwardIterator first, ForwardIterator last)
  303. {
  304. return kurtosis(first, last) - 3;
  305. }
  306. template<class Container>
  307. inline auto excess_kurtosis(Container const & v)
  308. {
  309. return excess_kurtosis(v.cbegin(), v.cend());
  310. }
  311. template<class RandomAccessIterator>
  312. auto median(RandomAccessIterator first, RandomAccessIterator last)
  313. {
  314. size_t num_elems = std::distance(first, last);
  315. BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined.");
  316. if (num_elems & 1)
  317. {
  318. auto middle = first + (num_elems - 1)/2;
  319. std::nth_element(first, middle, last);
  320. return *middle;
  321. }
  322. else
  323. {
  324. auto middle = first + num_elems/2 - 1;
  325. std::nth_element(first, middle, last);
  326. std::nth_element(middle, middle+1, last);
  327. return (*middle + *(middle+1))/2;
  328. }
  329. }
  330. template<class RandomAccessContainer>
  331. inline auto median(RandomAccessContainer & v)
  332. {
  333. return median(v.begin(), v.end());
  334. }
  335. template<class RandomAccessIterator>
  336. auto gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
  337. {
  338. using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
  339. BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples.");
  340. std::sort(first, last);
  341. if constexpr (std::is_integral<Real>::value)
  342. {
  343. double i = 1;
  344. double num = 0;
  345. double denom = 0;
  346. for (auto it = first; it != last; ++it)
  347. {
  348. num += *it*i;
  349. denom += *it;
  350. ++i;
  351. }
  352. // If the l1 norm is zero, all elements are zero, so every element is the same.
  353. if (denom == 0)
  354. {
  355. return double(0);
  356. }
  357. return ((2*num)/denom - i)/(i-1);
  358. }
  359. else
  360. {
  361. Real i = 1;
  362. Real num = 0;
  363. Real denom = 0;
  364. for (auto it = first; it != last; ++it)
  365. {
  366. num += *it*i;
  367. denom += *it;
  368. ++i;
  369. }
  370. // If the l1 norm is zero, all elements are zero, so every element is the same.
  371. if (denom == 0)
  372. {
  373. return Real(0);
  374. }
  375. return ((2*num)/denom - i)/(i-1);
  376. }
  377. }
  378. template<class RandomAccessContainer>
  379. inline auto gini_coefficient(RandomAccessContainer & v)
  380. {
  381. return gini_coefficient(v.begin(), v.end());
  382. }
  383. template<class RandomAccessIterator>
  384. inline auto sample_gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
  385. {
  386. size_t n = std::distance(first, last);
  387. return n*gini_coefficient(first, last)/(n-1);
  388. }
  389. template<class RandomAccessContainer>
  390. inline auto sample_gini_coefficient(RandomAccessContainer & v)
  391. {
  392. return sample_gini_coefficient(v.begin(), v.end());
  393. }
  394. template<class RandomAccessIterator>
  395. auto median_absolute_deviation(RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN())
  396. {
  397. using std::abs;
  398. using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
  399. using std::isnan;
  400. if (isnan(center))
  401. {
  402. center = boost::math::statistics::median(first, last);
  403. }
  404. size_t num_elems = std::distance(first, last);
  405. BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero-length vector is undefined.");
  406. auto comparator = [&center](Real a, Real b) { return abs(a-center) < abs(b-center);};
  407. if (num_elems & 1)
  408. {
  409. auto middle = first + (num_elems - 1)/2;
  410. std::nth_element(first, middle, last, comparator);
  411. return abs(*middle);
  412. }
  413. else
  414. {
  415. auto middle = first + num_elems/2 - 1;
  416. std::nth_element(first, middle, last, comparator);
  417. std::nth_element(middle, middle+1, last, comparator);
  418. return (abs(*middle) + abs(*(middle+1)))/abs(static_cast<Real>(2));
  419. }
  420. }
  421. template<class RandomAccessContainer>
  422. inline auto median_absolute_deviation(RandomAccessContainer & v, typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN())
  423. {
  424. return median_absolute_deviation(v.begin(), v.end(), center);
  425. }
  426. template<class ForwardIterator>
  427. auto interquartile_range(ForwardIterator first, ForwardIterator last)
  428. {
  429. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  430. static_assert(!std::is_integral<Real>::value, "Integer values have not yet been implemented.");
  431. auto m = std::distance(first,last);
  432. BOOST_ASSERT_MSG(m >= 3, "At least 3 samples are required to compute the interquartile range.");
  433. auto k = m/4;
  434. auto j = m - (4*k);
  435. // m = 4k+j.
  436. // If j = 0 or j = 1, then there are an even number of samples below the median, and an even number above the median.
  437. // Then we must average adjacent elements to get the quartiles.
  438. // If j = 2 or j = 3, there are an odd number of samples above and below the median, these elements may be directly extracted to get the quartiles.
  439. if (j==2 || j==3)
  440. {
  441. auto q1 = first + k;
  442. auto q3 = first + 3*k + j - 1;
  443. std::nth_element(first, q1, last);
  444. Real Q1 = *q1;
  445. std::nth_element(q1, q3, last);
  446. Real Q3 = *q3;
  447. return Q3 - Q1;
  448. } else {
  449. // j == 0 or j==1:
  450. auto q1 = first + k - 1;
  451. auto q3 = first + 3*k - 1 + j;
  452. std::nth_element(first, q1, last);
  453. Real a = *q1;
  454. std::nth_element(q1, q1 + 1, last);
  455. Real b = *(q1 + 1);
  456. Real Q1 = (a+b)/2;
  457. std::nth_element(q1, q3, last);
  458. a = *q3;
  459. std::nth_element(q3, q3 + 1, last);
  460. b = *(q3 + 1);
  461. Real Q3 = (a+b)/2;
  462. return Q3 - Q1;
  463. }
  464. }
  465. template<class RandomAccessContainer>
  466. inline auto interquartile_range(RandomAccessContainer & v)
  467. {
  468. return interquartile_range(v.begin(), v.end());
  469. }
  470. }
  471. #endif