$(STAT_DIR)/split.c.csv $(STAT_DIR)/split.a.csv: override args := bible.txt
.PRECIOUS: $(STAT_DIR)/%.h.csv
-$(STAT_DIR)/%.h.csv: $(STAT_DIR)/%.a.csv
- $(P_AWK) awk -F, 'BEGIN { print "Size,Count" } NR > 1 { a[$$3]++ } \
- END { for (i in a) print i "," a[i] }' $< | sort > $@
+# Histogram CSV, generated from the matching %.a.csv by hist.awk.  The script
+# is a prerequisite so that editing it regenerates the histograms.
+# The output goes to a temporary file that is renamed only on success: the
+# target is .PRECIOUS, so a partial file left behind by an interrupted run
+# would otherwise be kept and treated as up to date.
+# NOTE(review): assumes $(P_AWK) is a command prefix that tolerates a trailing
+# "&& ..." (the previous recipe already appended "| sort") -- confirm.
+$(STAT_DIR)/%.h.csv: $(STAT_DIR)/%.a.csv hist.awk
+	$(P_AWK) awk -F, -f $(filter %.awk,$^) $< > $@.tmp && mv -f $@.tmp $@
.PHONY: plot
plot: $(graphs)
--- /dev/null
+#!/usr/bin/awk -f
+#
+# Only a single argument may follow the interpreter in a "#!" line on many
+# systems, so the field separator is set in BEGIN rather than with "-F,"
+# here.  Passing "-F," on the command line as well is harmless.
+
+BEGIN {
+	# input records are comma-separated
+	FS = ","
+	# maximum number of distinct sizes printed before the END rule starts
+	# grouping them into ranges
+	MAX_SAMPLES = 50
+	# output CSV header
+	print "Size,Scan,No Scan";
+}
+
+# Seed the running size bounds from the first data record (record 1 is the
+# input CSV header), using the third field as the size.
+NR == 2 {
+	max = int($3)
+	min = max
+}
+
+# Tally every data record (the input CSV header is skipped) by its size,
+# taken from field 3, into either scan[] or no_scan[].
+NR > 1 {
+	n = int($3)
+	# keep both tables in step: every size seen has an entry in each of
+	# them, even if one of the counts stays at zero
+	if (!(n in scan))
+		scan[n] = 0
+	if (!(n in no_scan))
+		no_scan[n] = 0
+	# a positive field 6 means the cell has the NO_SCAN bit
+	if (int($6) > 0)
+		no_scan[n]++
+	else
+		scan[n]++
+	# track the smallest and largest size seen so far
+	if (n < min) {
+		min = n
+	} else if (n > max) {
+		max = n
+	}
+}
+
+# Return val as a short human-readable string: whole mebi units with an "M"
+# suffix from 1048576 up, whole kibi units with a "K" suffix from 1024 up,
+# and the plain number otherwise.  Any fractional part is dropped by the
+# integer conversion, so e.g. 1536 is rendered as "1K".
+# Returns directly from each branch so that no temporary leaks into the
+# global namespace.
+function h(val) {
+	if (val >= 1048576) # 1 M
+		return sprintf("%uM", val / 1048576)
+	if (val >= 1024) # 1 K
+		return sprintf("%uK", val / 1024)
+	return sprintf("%u", val)
+}
+
+# Print one "label,scan count,no-scan count" CSV row for each of the n
+# entries of o[1..n], in that order.
+#   s   array of "scan" counts, indexed by label
+#   ns  array of "no scan" counts, indexed by label
+#   o   array mapping positions 1..n to labels
+#   n   number of entries in o
+# The extra parameter i is the usual awk way of declaring a local variable;
+# callers pass four arguments, so the caller's own "i" is left untouched.
+function p(s, ns, o, n,    i) {
+	for (i = 1; i <= n; i++)
+		print o[i] "," s[o[i]] "," ns[o[i]]
+}
+
+# Emit the histogram rows.  With at most MAX_SAMPLES distinct sizes, one row
+# is printed per size in ascending order; otherwise the sizes are grouped
+# into consecutive ranges of equal width labelled "<from>-<to>".
+# Note: length() on an array and asort() are not available in every awk
+# implementation (asort() is a gawk extension).
+END {
+	n = 0	# number of rows queued in order[]
+	# reduce the number of elements in the histogram if there are too many
+	if (length(scan) > MAX_SAMPLES) {
+		step = int((max - min) / MAX_SAMPLES) + 1
+		for (size in scan) {
+			# compute the start of the range holding this size
+			# directly instead of searching every range for it
+			from = min + int((size - min) / step) * step
+			j = sprintf("%s-%s", h(from), h(from + step))
+			scan2[j] += scan[size]
+			no_scan2[j] += no_scan[size]
+		}
+		# "<=" so that the range starting exactly at max is visited too;
+		# that range exists whenever (max - min) is a multiple of step
+		for (from = min; from <= max; from += step) {
+			v = sprintf("%s-%s", h(from), h(from + step))
+			# h() rounds down to whole K/M units, so neighbouring
+			# ranges can share a label; their counts were merged
+			# above and the label must be queued only once
+			if ((v in scan2) && !(v in seen)) {
+				seen[v] = 1
+				order[++n] = v
+			}
+		}
+	}
+	# print output data
+	if (n > 0) {
+		p(scan2, no_scan2, order, n)
+	} else {
+		for (size in scan)
+			order[++n] = int(size)
+		# nothing to sort (or print) when the input had no data records
+		if (n > 0)
+			n = asort(order)
+		p(scan, no_scan, order, n)
+	}
+}
+