#!/bin/bash
# Quick and dirty script to audit how fonts are distributed across the
# packages of a yum package repository
#
# It is slow, it is ugly, and it requires a good network connection

# Function declarations

# Print the command-line synopsis and two invocation examples on stderr,
# then terminate the script with exit status 1.
usage() {
  {
    printf 'Usage: %s <id> <location>\n' "$0"
    printf '%s\n' \
      '<id>:       identifier of the package repository to check' \
      '<location>: location of the package repository to check' \
      '' \
      'Examples:'
    printf '%s tmp file:///tmp/rpm\n' "$0"
    printf '%s rawhide http://example.com/mirrors/fedora.redhat.com/fedora/linux/development/x86_64/os/\n' "$0"
    printf '\n'
  } >&2
  exit 1
}


# Extract one property from saved fc-query output, preferring its English
# label.
#
# Arguments:
#   $1  fontconfig property name (for example family, style, fontformat)
#   $2  file holding the fc-query description of one font file
# Outputs:
#   stdout  the first value of the property when the file has no matching
#           "<property>lang" line; otherwise the value found at the position
#           of the "en" entry of that line (an empty line when there is none)
#   stderr  a backspace followed by "×" when localized values exist but none
#           of them is English
#
# All working variables are local so that callers' variables with the same
# names (notably "file") are left untouched.
#
# FIXME: only extracts info about the first typeface in a TTC file for now
parse_localized_fc_query() {
  local field="$1"
  local file="$2"
  local fieldstring langstring

  # Turn a line such as '"A"(s) "B"(s)' into the list 'A|B|'
  fieldstring=$(awk -F ':' -v field="$field" \
                    '$1 == "\t"field { print $2 ; exit }' "$file" \
               | sed 's="(s)="=g' | sed 's=" *"=|=g'| sed 's= *" *==g')"|"
  if grep -q "^"$'\t'"${field}lang:" "$file" ; then
    langstring=$(awk -F ':' -v field="$field" \
                '$1 == "\t"field"lang" { print $2 ; exit }' "$file" \
                | sed 's="(s)="=g' | sed 's=" *"=|=g'| sed 's= *" *==g')"|"
    # Try to find the English label: drop the head of both lists until the
    # language list starts with "en" or is exhausted
    while [ -n "$langstring" ] && \
          [ "$(printf '%s\n' "$langstring" \
               | awk -F '|' '{ print $1 }')" != "en" ] ; do
      fieldstring=$(echo "$fieldstring" | sed 's+\([^|]*\)|\(.*\)+\2+g')
      langstring=$(echo "$langstring" | sed 's+\([^|]*\)|\(.*\)+\2+g')
    done
    # We could hide problems by reporting the first label regardless of its
    # language. But this is an audit script — we do not hide problems
    echo "$fieldstring" |  awk -F "|" '{ print $1 }'
    if [ -z "$fieldstring" ] ; then  echo -ne "\b×" >&2 ; fi
  else
    # No language information: report the first value as is (quoted, so that
    # inner whitespace and glob characters survive)
    printf '%s\n' "$fieldstring" | awk -F "|" '{ print $1 }'
  fi
}


# Re-wrap stdin so that it fits the terminal once indented, prefix every line
# with four spaces, and finish with an empty line.
pretty_indent() {
  fold -s -w "$(( $(tput cols) - 4 ))" \
    | while read text ; do
        printf '    %s\n' "$text"
      done
  printf '\n'
}


# Print "files|packages|source packages|file MiB|package MiB" for the
# "|"-separated records of the data file given as $1.
#
# Columns used: 1 = source package, 2-4 = package name, EVR and arch,
# 5 = package size in bytes, 14 = file size in bytes.
tally() {
  t_datafile=$1

  # One record per file; sizes are summed in bytes then reduced to whole MiB
  t_file=$(cat "$t_datafile" | wc -l)
  t_file_size=$(awk -F '|' '{ sum += $14 } END { print sum }' "$t_datafile")
  t_file_size=$((t_file_size / 1048576))

  # A package is counted (and sized) once, whatever its number of files
  t_rpm=$(awk -F '|' '{ print $2 "-" $3 "." $4 }' "$t_datafile" \
          | sort -u | wc -l)
  t_rpm_size=$(awk -F '|' '{ print $2 "-" $3 "." $4 "|" $5 }' "$t_datafile" \
               | sort -u \
               | awk -F '|' '{ sum += $2 } END { print sum }')
  t_rpm_size=$((t_rpm_size / 1048576))

  t_srpm=$(awk -F '|' '{ print $1 }' "$t_datafile" | sort -u | wc -l)

  echo "$t_file|$t_rpm|$t_srpm|$t_file_size|$t_rpm_size"
}


# Print a one-sentence tally of the records held in the data file $1, or
# "None!" when the file has no line.
summary() {
  if [ "$(cat "$1" | wc -l)" -le 0 ] ; then
    echo "⇒  None!" | pretty_indent
    return
  fi

  # tally prints: files|packages|source packages|file MiB|package MiB
  tally "$1" \
    | awk -F '|' '{
        printf "⇒ %s file(s) (%s MiB) in %s package(s) (%s MiB) generated from %s source package(s).\n", $1, $4, $2, $5, $3
      }' \
    | pretty_indent
}


# Append the outcome of one test to the summary table, as a new column.
#
# $1 = summary file (header line, then one SRPM|RPM|File line per file that
#      failed at least one test); rewritten in place, data lines sorted
# $2 = temporary data file holding the records that failed this test
# $3 = test id, used as title of the new column
#
# Uses tmp/tmp.ccs.csv as scratch file and removes it afterwards.
complete_csv_summary() {
  awk -F '|' -v results="$2" -v label="$3" '
    # Result records: remember every failing SRPM|RPM|file triplet
    FILENAME == results { failed[$1 "|" $2 "|" $9] = 1 ; next }

    # Summary header: widen it by one column named after the test
    FNR == 1 {
      width = NF + 1
      head = $1
      for (col = 2 ; col <= width ; col++) head = head "|" $col
      print head label
      next
    }

    # Known summary line: pad it to the new width and flag it if it failed
    {
      key = $1 "|" $2 "|" $3
      row = $1
      for (col = 2 ; col <= width ; col++) row = row "|" $col
      print row failed[key]
      seen[key] = 1
    }

    # Failures on files the summary did not list yet: empty older columns
    END {
      for (key in failed) {
        if (seen[key]) continue
        row = key
        for (col = 4 ; col <= width ; col++) row = row "|"
        print row failed[key]
      }
    }' "$2" "$1" > tmp/tmp.ccs.csv

  # Keep the header first, sort everything else
  {
    sed -n '1p' tmp/tmp.ccs.csv
    sed '1d' tmp/tmp.ccs.csv | sort -f -t '|'
  } > "$1"
  rm tmp/tmp.ccs.csv
}


# Print one "key|<tally>" line per distinct value found in the last column
# of the data file $1, in sorted key order.
#
# Uses tmp/tmp.ss.csv as scratch file.
substats() {
  ss_datafile="$1"

  awk -F '|' '{ print $NF }' "$ss_datafile" | sort -u \
    | while read group ; do
        printf '%s|' "$group"
        # Restrict the data to the records of this group before counting
        awk -F '|' -v key="$group" '$NF==key' "$ss_datafile" > tmp/tmp.ss.csv
        tally tmp/tmp.ss.csv
        rm tmp/tmp.ss.csv
      done
}


# Print the tally of the data file $1 followed, when it is not empty, by two
# tables breaking the same figures down by font format and by architecture.
#
# Uses tmp/tmp.s.csv as scratch file (a copy of the data with the grouping
# key appended as last column, as substats expects).
stats() {
s_datafile="$1"

summary "$s_datafile"

if [ "$(cat "$s_datafile" | wc -l)" -gt 0 ] ; then
  # Breakdown by fontconfig format (column 12); records without a format are
  # left out
  (echo "Format|Files|rpm|srpm|Files (MiB)|rpm (MiB)"
  awk -F '|' '$12 != "" { print $0 "|" $12 }' "$s_datafile" > tmp/tmp.s.csv
  substats tmp/tmp.s.csv) | column -t -s '|' | pretty_indent

  # Breakdown by package architecture (column 4); the first header cell used
  # to read "Format" here, which mislabelled the table
  (echo "Arch|Files|rpm|srpm|Files (MiB)|rpm (MiB)"
  awk -F '|' '{ print $0 "|" $4 }' "$s_datafile" > tmp/tmp.s.csv
  substats tmp/tmp.s.csv)| column -t -s '|' | pretty_indent

  rm tmp/tmp.s.csv
fi
}


# Print the sorted, de-duplicated package names (column 2) of the data file
# $1 on one wrapped line; packages flagged "M" in column 6 are shown between
# square brackets. Prints nothing when the file has no line.
list-rpm() {
  if [ "$(cat "$1" | wc -l)" -gt 0 ] ; then
    awk -F '|' '{
          name = ($6 == "M") ? "[" $2 "]" : $2
          print name
        }' "$1" \
      | sort | uniq \
      | awk -F '|' '{ list = list " " $1 } END { print list }' \
      | pretty_indent
  fi
}


# Split the arguments into words on blanks, "_", ".", "-" and backslashes,
# and print the words in reverse order, each followed by one space, after a
# leading space (so every word is space-delimited on both sides).
canonalise_and_invert_string() {
  cais=""
  for token in $(echo "$@" \
                 | sed -e "s=[ \t_\.-]\+= =g" -e "s=^ ==g" -e "s= $==g") ; do
    # Prepending reverses the word order
    cais="$token $cais"
  done
  echo " $cais"
}


# Remove one qualifier from a style name.
#
# $1 = style name
# $2 = blank-separated list of qualifiers ("_" stands for a word break)
#
# Every qualifier is matched, case-insensitively and as whole words, against
# the canonical inverted form of $1. The list is walked last to first and
# each match overrides the previous one, so the earliest matching entry of
# $2 decides the outcome. Prints what is left of the name, in its original
# word order, without surrounding spaces.
wws_resolve () {
  cais_name=$(canonalise_and_invert_string $1)
  result="$cais_name"

  # Reverse the qualifier list
  patterns=""
  for pattern in $2 ; do
    patterns="$pattern $patterns"
  done

  for pattern in $patterns ; do
    pattern=$(canonalise_and_invert_string $pattern)
    tmp=$(echo "$cais_name" | sed "s=$pattern= =i")
    if [ "$tmp" != "$cais_name" ] ; then
      result="$tmp"
    fi
  done

  # Inverting a second time restores the original word order
  result=$(canonalise_and_invert_string "$result" \
           | sed -e "s=^ ==" -e "s= $==")
  echo "$result"
}


# $1 test id
# $2 "title" of "help"
test_help() {
case "$1" in
  "arch-package")
    case "$2" in
      "title")
        echo "Fonts in arch packages"
        ;;
      "help")
        cat << EOF
Fonts are not arch-specific; please make sure they are deployed in noarch
packages.
EOF
        ;;
    esac
    ;;
  "outside-usr-share-fonts")
    case "$2" in
      "title")
        echo "Fonts deployed outside /usr/share/fonts"
        ;;
      "help")
        cat << EOF
The standard location for font files is under the /usr/share/fonts root
(default fontconfig setting). Please simplify the work of font utilities
and use it exclusively. It is always possible to symlink font files
somewhere else on the filesystem if an application requires it.

If you fear exposing your font files in fontconfig will cause problems,
please work with the fontconfig maintainers to resolve them.
EOF
        ;;
    esac
    ;;
  "mixed-with-non-font-data")
    case "$2" in
      "title")
        echo "Fonts in packages that contain non-font data"
        ;;
      "help")
        cat << EOF
Please do not mix font files with non-font data in packages. Fonts are
usually useful outside of the package that embeds them and should be
installable without pulling in other material.
EOF
        ;;
    esac
    ;;
  "without-rpm-metadata")
    case "$2" in
      "title")
        echo "Fonts in packages that do not declare font metadata"
        ;;
      "help")
        cat << EOF
Font-specific rpm metadata is required for automatic font installation to
work. If you apply our font packaging templates, it will be generated at
package creation time.
EOF
        ;;
    esac
    ;;
  "bad-rpm-naming")
    case "$2" in
      "title")
        echo "Fonts in packages that do not respect font naming conventions"
        ;;
      "help")
        cat << EOF
Please respect font package naming conventions and provide consistent
packages to users. Some scripts may depend on strict package naming.
EOF
        ;;
    esac
    ;;
  "bad-family-naming")
    case "$2" in
      "title")
        echo "Fonts that declare face attributes in family names"
        ;;
      "help")
        cat << EOF
To be properly processed by applications face qualifiers need to be
declared in style names. Some application stacks such as Microsoft WPF will
try to workaround bad font naming with dynamic renaming heuristics¹, but
heuristics are brittle and pose interoperability problems with applications
that do not use them.

If one your font files is listed here please ask its upstream to fix its
naming so it does not need further reprocessing. And in the meanwhile patch
it (if it is available in sfd form) or add a fontconfig rule to your
package to hide the problem².

There may be a few false positives in this test as some common face
qualifiers can be used with a different meaning in family names.

¹ http://blogs.msdn.com/text/attachment/2249036.ashx
  http://blogs.adobe.com/typblography/typotechnica2007/Font%20names.pdf
  http://blogs.adobe.com/typblography/atypi2006/CSS%20&%20OT%2015.pdf
² cf the “fontpackages” remapping template; unfortunately this workaround
won't fix problems for non-fontconfig applications, or when interoperating
with other systems.
EOF
        ;;
    esac
    ;;
  "bad-style-naming")
    case "$2" in
      "title")
        echo "Font faces that declare non-WWS compliant styles"
        ;;
      "help")
        cat << EOF
This WWS-like test checks if font faces use the “Width Weight Slant” naming
convention¹. As noted by Adobe the CSS family model is less than ideal, but
it's a standard and applications expect it².

Since our applications do not workaround bad font naming with dynamic
renaming heuristics, achieving consistent style naming that can be used in
CSS/web oriented applications requires fixing face naming directly in the
font files. For this reason we test font style naming separately from font
family naming, and do not support complex weight abbreviations and
suffixes³.

To pass this test make sure your style names do not include any qualifier
not defined in the WWS whitepaper¹, and that “Width”, “Weight” or “Slant”
are defined only once. Any other qualifier belongs in the font family name.

If one your font files is listed here please ask its upstream to fix its
naming so it does not need further reprocessing. And in the meanwhile patch
it (if it is available in sfd form) or add a fontconfig rule to your
package to hide the problem⁴.

¹ http://blogs.msdn.com/text/attachment/2249036.ashx
  http://blogs.adobe.com/typblography/typotechnica2007/Font%20names.pdf
² http://blogs.adobe.com/typblography/atypi2006/CSS%20&%20OT%2015.pdf
³ As defined in the end of the WWS renaming algorithm described in the
  Microsoft whitepaper.
⁴ cf the “fontpackages” remapping template; unfortunately this workaround
  won't fix problems for non-fontconfig applications, or when
  interoperating with other systems.
EOF
        ;;
    esac
    ;;
  "duplicated-file")
    case "$2" in
      "title")
        echo "Exact font file duplication"
        ;;
      "help")
        cat << EOF
Several packages duplicate font files with the same checksum. This
needlessly wastes resources.
EOF
        ;;
    esac
    ;;
  "duplicated-face-ext")
    case "$2" in
      "title")
        echo "Font faces duplicated by different packages"
        ;;
      "help")
        cat << EOF
Face duplication wastes resources infrastructure and user side.

Very often an upstream that copied some fonts will forget to keep them up
to date, and the duplication will result in the distribution of old buggy
data. Even when some duplicated font faces are a genuine fork with
different features from the original, applications won't be able to select
them reliably because of naming collision.

We should always ship only one version of a font face in the repository,
and use fontconfig or symlinks to share it accross packages.
EOF
        ;;
    esac
    ;;
  "duplicated-face-int")
    case "$2" in
      "title")
        echo "Font faces duplicated within a package"
        ;;
      "help")
        cat << EOF
Face duplication within a package is almost certainly a bug, except for
special symbol font families.

Fonts that were split because of the limitations of legacy font formats
(PCF, Type 1…) should be converted to modern OpenType (TT, CFF or bitmap)
containers.
EOF
        ;;
    esac
    ;;
  "family-mixing")
    case "$2" in
      "title")
        echo "Packages that mix different font families"
        ;;
      "help")
        cat << EOF
Reliable font auto-installation requires shipping only one font family
per font package.

(If you've remapped some font names at the fontconfig level your package
may appear here pending some fontconfig fixes upstream is aware of).
EOF
        ;;
    esac
    ;;
  "font-linking")
    case "$2" in
      "title")
        echo "Font linking"
        ;;
      "help")
        cat << EOF
Symlinking is a way for non-font packages to avoid duplicating font files,
but it is also a symptom of missing or incomplete fontconfig support.
Fontconfig has been our default font system for a long time, and accessing
fonts by other means will cause behaviour inconsistencies and many other
problems (since fontconfig is much more than a file locating library)

Please ask the package upstream to add fontconfig support to their code
(possibly, via a higher-level library such as pango-cairo).
EOF
        ;;
    esac
    ;;
  "broken-symlink")
    case "$2" in
      "title")
        echo "Broken symlinks to font files"
        ;;
      "help")
        cat << EOF
The symlinked font file has moved, been renamed, or the symlink was never
properly set up.

You can avoid the hassle of maintaining font file symlinks by asking the
package upstream to add fontconfig support to their code (possibly, via a
higher-level library such as pango-cairo).
EOF
        ;;
    esac
    ;;
  "rpmlint")
    case "$2" in
      "title")
        echo "Packages with fonts rpmlint errors on"
        ;;
      "help")
        cat << EOF
Check rpmlint output to fix those packages (using the -i flag if you
don't understand it).
EOF
        ;;
    esac
    ;;
  "libmagic")
    case "$2" in
      "title")
        echo "Packages with font files not identified as such by libmagic"
        ;;
      "help")
        cat << EOF
libmagic could not identify some files with font-like extensions in the
package. The files may be malformed and in need of fixing, or they use a
font extension when they should not, or libmagic has a bug (in that case,
please report the problem so it is fixed).
EOF
        ;;
    esac
    ;;
  "fc-query")
    case "$2" in
      "title")
        echo "Packages with font files that fc-query can not parse"
        ;;
      "help")
        cat << EOF
fc-query could not parse some font files in the package. The files may be
malformed and in need of fixing, or fc-query has a bug (in that case,
please report the problem so it is fixed).
EOF
        ;;
    esac
    ;;
  "no-english-metadata")
    case "$2" in
      "title")
        echo "Packages with localized metadata but no English variant"
        ;;
      "help")
        cat << EOF
Some font files in the package declare localized metadata, but do not
include an English variant. They need to be fixed to also declare metadata
in English.
EOF
        ;;
    esac
    ;;
 *)
    echo "Unknown test."
    ;;
esac
}


# Close a test: record its results in the summary table, print the tally
# and, when something failed, the explanation of the test.
#
# $1 this test id
# $2 csv file with the results of this test
# $3 csv file that consolidates the results of all tests
test_wrapup() {
  complete_csv_summary "$3" "$2" "$1"
  summary "$2"

  # The help text is only shown when the test caught at least one record
  [ "$(cat "$2" | wc -l)" -gt 0 ] \
    && { echo -n "☛ " ; test_help "$1" help ; } | pretty_indent
}


# Print a package × test score table, followed by a numbered legend with
# the title and explanation of every test that failed at least once.
#
# $1 csv with test results (header line, then SRPM|RPM|File|<one column per
#    test>); "None!" is printed when it holds no data line
#
# Requires gawk (asorti).
test_score() {
if [ "$(cat "$1" | wc -l)" -le 1 ] ; then
  echo "⇒  None!"
  return
fi

# Score table: one line per SRPM|RPM pair, one column per test with a
# non-zero total; tests are numbered after their column (column 4 is test 1)
awk -F '|' '
  FNR == 1 { last = NF ; next }
  {
    pkg = $1 "|" $2
    failing[pkg] = 1
    for (col = 4 ; col <= last ; col++) {
      hits[pkg "|" col] += $col
      total[col] += $col
    }
  }
  END {
    out = "SRPM|RPM"
    for (col = 4 ; col <= last ; col++) {
      if ( total[col] ) { out = out "|" (col - 3) }
    }
    print out
    count = asorti(failing, sorted)
    for (row = 1 ; row <= count ; row++) {
      out = sorted[row]
      for (col = 4 ; col <= last ; col++) {
        if ( ! total[col] ) { continue }
        cell = hits[sorted[row] "|" col]
        out = out "|" (cell ? cell : "‧")
      }
      print out
    }
    out = " |Total"
    for (col = 4 ; col <= last ; col++) {
      if ( total[col] ) { out = out "|" total[col] }
    }
    print out
  }' "$1" | column -t -s '|'

# Legend: "number|test id" for each test with a non-zero total
awk -F '|' '
  FNR == 1 {
    last = NF
    for (col = 4 ; col <= last ; col++) {
      legend[col] = (col - 3) "|" $col
    }
    next
  }
  {
    for (col = 4 ; col <= last ; col++) { total[col] += $col }
  }
  END {
    for (col = 4 ; col <= last ; col++) {
      if ( total[col] ) { print legend[col] }
    }
  }' "$1" | while read entry ; do
    echo ""
    number=$(echo $entry | awk -F '|' '{ print $1 }')
    id=$(echo $entry | awk -F '|' '{ print $2 }')
    echo -n "$number. "
    test_help $id title
    echo ""
    { echo -n "☛ " ; test_help $id help ; } | pretty_indent
  done
}


# Query the repository, fetch and unpack every candidate package, and record
# one line per font file found in it.
#
# Globals read: REPOID, REPOURL, FPL, PWFL, FFL
# Files written (relative to the current directory):
#   $FPL   packages that provide "font(*)"
#   $PWFL  packages that ship files with common font extensions
#          (both as: source package|name|epoch:version-release|arch|size)
#   $FFL   one record per font file:
#          rpm|rpmlint exit status|number of non-font files|file|family|
#          style|format|libmagic type|size|sha256 checksum|symlink target
#   tmp/rpmlint-<rpm>.txt  rpmlint output for each inspected package
#
# Progress is shown on the terminal with one glyph per step and one letter
# per font file. Exits the script when both package lists are empty.
collect() {
echo "Searching for packages with font metadata…"
# The sed expression reduces the source rpm file name of the first column to
# its leading part, dropping its last two dash-separated components
repoquery "--repofrompath=$REPOID,$REPOURL" "--repoid=$REPOID" \
          --qf "%{sourcerpm}|%{name}|%{epoch}:%{version}-%{release}|%{arch}|%{packagesize}" \
          --whatprovides "font(*)" --quiet \
          | sed 's=^\([^|]\+\?\)\-\([^-|]\+\?\)\-\([^-|]\+\?\)|=\1|=g' \
          | sort | uniq > "$FPL"

echo "Searching for packages that include files with common font extensions…"
repoquery "--repofrompath=$REPOID,$REPOURL" "--repoid=$REPOID" \
          --qf "%{sourcerpm}|%{name}|%{epoch}:%{version}-%{release}|%{arch}|%{packagesize}" \
          -f '*.ttf' -f '*.otf' -f '*.ttc' \
          -f '*.pfb' -f '*.pfa' \
          -f '*.pcf.gz' -f '*.pcf' -f '*.bdf' --quiet \
          | sed 's=^\([^|]\+\?\)\-\([^-|]\+\?\)\-\([^-|]\+\?\)|=\1|=g' \
          | sort | uniq > "$PWFL"

if [ $(cat "$FPL" "$PWFL" | wc -l) -eq 0 ] ; then
  echo "Nothing to do!"
  exit
fi

echo "Inspecting packages:"
rm -f "$FFL"
# Everything below uses paths relative to tmp/; give up rather than unpack
# and delete files in the wrong place
cd "tmp" || { echo "Cannot enter the tmp work directory." >&2 ; exit 1 ; }
cat "../$FPL" "../$PWFL" | awk -F '|' '{ print $2 "-" $3 "." $4 }' \
  | sort | uniq | while read -r rpm ; do
  echo -n " – $rpm"
  mkdir "$rpm"
  # Skipping the package keeps the "cd .." and "rm -fr" at the end of the
  # loop body from running one level too high
  cd "$rpm" || { echo " (no work directory, skipped)" >&2 ; continue ; }
  echo -n " ◔"
  rpm_loc=$(repoquery "--repofrompath=$REPOID,$REPOURL" "--repoid=$REPOID" --location "$rpm" 2>/dev/null)
  # Local repositories are read in place, anything else is downloaded
  if echo "$rpm_loc" | grep -q -e "^file://" ; then
    rpm_file=$(echo "$rpm_loc" | sed "s=^file://=/=")
  else
    wget --quiet -O "$rpm.rpm" "$rpm_loc"
    rpm_file="$rpm.rpm"
  fi
  echo -ne "\b◑"
  # Only the exit status of rpmlint is kept here; its messages go to a file
  rpmlint_score=$(rpmlint -i "$rpm_file" > "../rpmlint-$rpm.txt" 2>&1 ; echo $?)
  rpm2cpio "$rpm_file" > "$rpm.cpio"
  echo -ne "\b◕"
  cpio --quiet -it < "$rpm.cpio" > "$rpm.lst"
  # Count the files that are neither fonts nor in font-related directories
  nofont_score=$(cat "$rpm.lst" \
    | grep -v "^./usr/share/fonts" \
    | grep -v "^./usr/share/fontconfig/conf.avail/" \
    | grep -v "^./etc/fonts/conf.d/" \
    | grep -v "^./usr/share/doc/" \
    | grep -v "^./etc/X11/fontpath.d/" \
    | grep -viE '\.((ttf)|(ttc)|(otf)|(pfa)|(pfb)|(bdf)|(pcf)|(pcf\.gz))$' |wc -l)
  cat "$rpm.lst" \
    | grep -iE '\.((ttf)|(ttc)|(otf)|(pfa)|(pfb)|(bdf)|(pcf)|(pcf\.gz))$' \
    > "$rpm.fonts.lst"
  # Extract the font files only
  cpio -idm --quiet -E "$rpm.fonts.lst" < "$rpm.cpio"
  echo -ne "\b● "
  cat "$rpm.fonts.lst" | while IFS= read -r file; do
    unset target checksum type family style format
    type=$(file -bzh "$file")
    # Classify on the leading part of the libmagic description: anything
    # from the first " (", " `" or "," on is dropped, as are trailing spaces
    case $(echo "$type" | sed 's+ (\(.*\)++g' \
                             | sed 's+ `\(.*\)++g' \
                             | sed 's+,\(.*\)++g' \
                             | sed 's+\( \)*$++g' ) in
      "TrueType font data")
        echo -n "t"
        ;;
      "TrueType font collection data")
        echo -n "T"
        ;;
      "OpenType font data")
        echo -n "o"
        ;;
      "X11 Portable Compiled Font data")
        echo -n "b"
        ;;
      "X11 BDF font text")
        echo -n "B"
        ;;
      # PostScript files are a mess
      "PostScript Type 1 font text")
        echo -n "P"
        ;;
      "PostScript Type 1 font program data")
        echo -n "p"
        ;;
      "PostScript document text conforming DSC level 3.0")
        type="ignored"
        echo -n "-"
        ;;
      "PostScript document text"|"8086 relocatable")
        echo -n "x" >&2
        ;;
      "symbolic link to"|"broken symbolic link to")
        # Express the link target relative to the package root. The prefix
        # is removed with a parameter expansion because the work directory
        # name embeds the package name, which may hold characters that are
        # special to sed (a "+" would clash with the expression delimiter)
        target=$(readlink -m "$file")
        target=${target#"$PWD"}
        # Only links pointing into the font root are of interest
        if echo "$target" | grep -q "^/usr/share/fonts" ; then
          type="Link"
          echo -n "l"
        else
          type="ignored"
          echo -n "-"
        fi
        ;;
      *)
        echo -n "?"
        ;;
    esac
    if [ "$type" != "ignored" ] ; then
      size=$(du -b "$file" | awk '{ print $1 ; exit }')
      # Symlinks get neither checksum nor font metadata
      if [ ! -h "$file" ] ; then
        checksum=$(sha256sum "$file" | awk '{ print $1 ; exit }')
        if fc-query "$file" 2> /dev/null > "$file.desc" ; then
          family=$(parse_localized_fc_query family "$file.desc")
          style=$(parse_localized_fc_query style "$file.desc")
          format=$(parse_localized_fc_query fontformat "$file.desc")
        else
          echo -ne "\bX" >&2
        fi
      fi
      # "./usr/…" as listed by cpio becomes "/usr/…"
      file=$(echo "$file" | sed "s+^./+/+g")
      echo "$rpm|$rpmlint_score|$nofont_score\
|$file|$family|$style|$format|$type|$size|$checksum|$target" >> "../../$FFL"
    fi
  done
  cd ..
  rm -fr "$rpm"
  echo " ♻"
done
cd ..
}


# Merge the package lists and the per-file records into the consolidated
# data files.
#
# Globals read: PWFL, FPL, FFL (input files), FL, FLNM (output files)
#
# $FL receives, for every package of $PWFL and every record $FFL holds for
# it, one line made of:
#   the 5 columns of the $PWFL line | "M" when the same line is in $FPL |
#   the $FFL record minus its first column | for symlinks, the package of
#   the first $FFL record whose file is the link target
# $FLNM receives the first $FL line of every distinct
# (source package, package name, file, checksum) combination, restricted to
# lines that have a checksum.
consolidate() {
echo "Consolidating data…"

local rpmline metadata rpm sig

# Start from an empty file so that later stages always find it
: > "$FL"
while IFS= read -r rpmline ; do
  # Whole-line, fixed-string comparison: package data is not a regex
  if grep -qxF -e "$rpmline" "$FPL" ; then metadata="M" ; else metadata="" ; fi
  rpm=$(printf '%s\n' "$rpmline" \
        | awk -F '|' '{ print $2 "-" $3 "." $4 ; exit }')
  # The prefix goes through the environment, which awk takes verbatim.
  # Doing the symlink lookup inside awk keeps targets that contain blanks
  # in one piece.
  RFA_PREFIX="$rpmline|$metadata" \
  awk -F '|' -v rpm="$rpm" '
    # First reading of the file list: first package seen for each path
    NR == FNR { if (!($4 in provider)) provider[$4] = $1 ; next }
    # Second reading: emit the records of the current package
    $1 == rpm {
      source = ""
      if ($8 == "Link" && ($11 in provider)) source = provider[$11]
      rest = $0
      sub(/^[^|]*\|/, "", rest)
      print ENVIRON["RFA_PREFIX"] "|" rest "|" source
    }' "$FFL" "$FFL" >> "$FL"
done < "$PWFL"

awk -F '|' '$15 != "" { print $1 "|" $2 "|" $9 "|" $15 }' "$FL" | sort | uniq \
  | while IFS= read -r sig ; do
    awk -F '|' -v sig="$sig" \
        '($1 "|" $2 "|" $9 "|" $15) == sig { print $0 ; exit }' "$FL"
  done > "$FLNM"
}


# Run every audit test on the consolidated data and print the problem report.
#
# Globals read: FL (all consolidated records), FLNM (the same without
#               multilib duplicates), TSUM (test summary table, recreated
#               here with its "SRPM|RPM|File" header)
# Scratch files: tmp/tmp.csv, tmp/tmp1.csv
#
# Every test follows the same outline: print its title, select the offending
# records into tmp/tmp.csv, print a listing of them, then call test_wrapup,
# which adds a column to $TSUM and prints the tally and the help text.
#
# Column numbers used below: 1 SRPM, 2 RPM, 3 EVR, 4 arch, 6 rpm metadata
# flag, 7 rpmlint score, 8 foreign data, 9 file name, 10 family, 11 face,
# 12 fontconfig format, 13 libmagic format, 15 checksum, 16 symlink target,
# 17 symlink provider.
analyse() {

echo ""
echo "Problem report:"
echo ""

echo "SRPM|RPM|File" > "$TSUM"

echo -n "— "
test_help arch-package title

awk -F '|' '($13 != "Link") && ($4 != "noarch")' "$FL" > tmp/tmp.csv

[ $(cat tmp/tmp.csv | wc -l) -gt 0 ] && \
awk -F '|' '{ print $2 "." $4 "|" $6 }' tmp/tmp.csv | sort | uniq \
  | awk -F '|' '{ if ( $2 == "M" ) list=(list " [" $1 "]") ;
                              else list=(list " "  $1    ) } END \
                { print list }' | pretty_indent

test_wrapup arch-package tmp/tmp.csv "$TSUM"


echo -n "— "
test_help outside-usr-share-fonts title

awk -F '|' '($13 != "Link") && ($9 !~ /^\/usr\/share\/fonts\//)' "$FL" > tmp/tmp.csv

list-rpm tmp/tmp.csv
test_wrapup outside-usr-share-fonts tmp/tmp.csv "$TSUM"


echo -n "— "
test_help mixed-with-non-font-data title

awk -F '|' '($13 != "Link") && ($8 != "0")' "$FL" > tmp/tmp.csv

list-rpm tmp/tmp.csv
test_wrapup mixed-with-non-font-data tmp/tmp.csv "$TSUM"


echo -n "— "
test_help without-rpm-metadata title

awk -F '|' '($13 != "Link") && ($6 != "M")' "$FL" > tmp/tmp.csv

list-rpm tmp/tmp.csv
test_wrapup without-rpm-metadata tmp/tmp.csv "$TSUM"


echo -n "— "
test_help bad-rpm-naming title

awk -F "|" '($13 != "Link") && \
            $2 !~ /^[0-9abcdefghijklmnopqrstuvwxyz\.-]*-fonts$/' "$FL" \
            > tmp/tmp.csv

list-rpm tmp/tmp.csv
test_wrapup bad-rpm-naming tmp/tmp.csv "$TSUM"


echo -n "— "
test_help bad-family-naming title

rm -f tmp/tmp.csv
touch tmp/tmp.csv
# tmp/tmp1.csv maps each family name to a space-delimited canonical form in
# which the face qualifiers below can be searched as whole words
awk -F "|" '{ print $10 }' "$FL" | sort | uniq \
  | while read family ; do
    echo -n "$family|"
    echo " $family " | sed "s=[ \t_\.-]\+= =g"
  done > tmp/tmp1.csv
for token in Book Normal Regular Upright ita ital italic cursive \
             kursiv inclined oblique backslanted backslant slanted \
             extra_compressed ext_compressed ultra_compressed \
             ultra_condensed ultra_cond UltraCondensed compressed \
             extra_condensed ext_condensed extra_cond ext_cond \
             ExtraCondensed narrow compact semi_condensed semi_cond \
             SemiCondensed wide semi_expanded semi_extended SemiExpanded \
             extra_expanded ext_expanded extra_extended ext_extended \
             ExtraExpanded ultra_expanded ultra_extended UltraExpanded \
             condensed cond expanded extended extra_thin ext_thin \
             ultra_thin Thin extra_light ext_light ultra_light ExtraLight \
             semi_bold demi_bold DemiBold extra_bold ext_bold ultra_bold \
             ExtraBold extra_black ext_black ultra_black ExtraBlack bold \
             light medium black heavy nord demi ultra ; do
  token=$(echo " $token " | sed 's+_+ +g')
  awk -F "|" -v IGNORECASE=1 -v token="$token" \
  '$2 ~ token { print $1 }' tmp/tmp1.csv
done | sort | uniq | while read family ; do
  awk -F "|" -v family="$family" '$10 == family' "$FL" >> tmp/tmp.csv
done

awk -F "|" '{ print $10 }' tmp/tmp.csv | sort | uniq \
  | while read family ; do
    rpmlist=$(awk -F "|" -v family="$family" '$10 == family \
      { if ( $6 == "M" ) print "[" $2 "]" ; \
                    else print     $2     }' tmp/tmp.csv \
      | sort | uniq | while read rpm ; do echo -n "$rpm " ; done)
    echo "$family|$rpmlist"
  done | column -t -s '|' | pretty_indent

test_wrapup bad-family-naming tmp/tmp.csv "$TSUM"


echo -n "— "
test_help bad-style-naming title
echo -n "   "

#FIXME It would also be great to check if the naming is corrected by the
#      fontconfig files shipped in the package, but this requires fixes in
#      fc-scan (ability to process non-deployed fontconfig files)
rm -f tmp/tmp.csv
touch tmp/tmp.csv
# A face name passes when nothing is left of it once one qualifier of each
# class (regular, slant, width, weight) has been removed
awk -F "|" '{ print $11 }' "$FL" | sort | uniq \
  | while read face ; do
    echo -ne "\b○"
    rface=$(wws_resolve "$face" "Book Normal Regular Roman Upright")
    echo -ne "\b◔"
    rface=$(wws_resolve "$rface" "ita ital italic cursive kursiv inclined \
                                  oblique backslanted backslant slanted")
    echo -ne "\b◑"
    rface=$(wws_resolve "$rface" "extra_compressed ext_compressed \
                                  ultra_compressed ultra_condensed \
                                  ultra_cond UltraCondensed compressed \
                                  extra_condensed ext_condensed extra_cond \
                                  ext_cond ExtraCondensed narrow compact \
                                  semi_condensed semi_cond SemiCondensed \
                                  wide semi_expanded semi_extended \
                                  SemiExpanded extra_expanded ext_expanded \
                                  extra_extended ext_extended ExtraExpanded \
                                  ultra_expanded ultra_extended \
                                  UltraExpanded condensed cond expanded \
                                  extended")
    echo -ne "\b◕"
    rface=$(wws_resolve "$rface" "extra_thin ext_thin ultra_thin Thin \
                                  extra_light ext_light ultra_light \
                                  ExtraLight semi_bold demi_bold DemiBold \
                                  extra_bold ext_bold ultra_bold ExtraBold \
                                  extra_black ext_black ultra_black \
                                  ExtraBlack bold light medium black heavy \
                                  nord demi ultra")
    if [ "$rface" != "" ] ; then
      awk -F "|" -v face="$face" '$11 == face' "$FL" >> tmp/tmp.csv
      echo -ne "\bx "
    else echo -ne "\b●"
    fi
  done
echo ""

[ $(cat tmp/tmp.csv | wc -l) -gt 0 ] && \
awk -F "|" '{ print $10 ", " $11 }' tmp/tmp.csv | sort | uniq \
  | while read fontface ; do
    awk -F "|" -v fontface="$fontface" '($10 ", " $11) == fontface \
      { if ( $6 == "M" ) print fontface "|" $9 "|[" $2 "]" ; \
                    else print fontface "|" $9 "|"  $2     }' tmp/tmp.csv \
      | sort | uniq
  done | column -t -s '|' | pretty_indent

test_wrapup bad-style-naming tmp/tmp.csv "$TSUM"


echo -n "— "
test_help duplicated-file title

awk -F '|' '{ print $15 }' "$FLNM" | sort | uniq -d \
  | while read checksum ; do
    awk -F '|' -v checksum="$checksum" '$15==checksum' "$FLNM"
done > tmp/tmp.csv

awk -F '|' '{ print $15 }' tmp/tmp.csv | uniq \
  | while read checksum ; do
    awk -F '|' -v checksum="$checksum" '$15==checksum \
        { if ( $6 == "M" ) print $9 "|[" $2 "." $4 "]" ;
          else             print $9 "|"  $2 "." $4  }' \
    tmp/tmp.csv | column -t -s '|' | pretty_indent
  done

test_wrapup duplicated-file tmp/tmp.csv "$TSUM"


echo -n "— "
test_help duplicated-face-ext title

rm -f tmp/tmp.csv
touch tmp/tmp.csv
awk -F '|' '($10 != "") && ($11 != "") \
    { print $2 "-" $3 "." $4 "|" $10 "|" $11 }' "$FLNM" \
    | sort | uniq | awk -F '|' '{ print $2 "|" $3 }' \
    | sort | uniq -d | while read face ; do
    awk -F '|' -v face="$face" \
        '($12 != "PCF") && (($10 "|" $11)==face)' "$FLNM" > tmp/tmp1.csv
    packages=$(awk -F '|' '{ if ( $6 == "M" ) print "[" $2 "]" ; \
                             else print $2 }' tmp/tmp1.csv \
               | sort | uniq | while read rpm ; do echo -n "$rpm " ; done)
    count=$(awk -F '|' '{ print $2 "-" $3 "." $4 }' tmp/tmp1.csv \
            | sort | uniq | wc -l)
    cat tmp/tmp1.csv >> tmp/tmp.csv
    echo "$count|$face|$packages"
  done | sort -nr | column -t -s '|' | pretty_indent

test_wrapup duplicated-face-ext tmp/tmp.csv "$TSUM"


echo -n "— "
test_help duplicated-face-int title

awk -F '|' '($10 != "") && ($11 != "") && ($12 != "PCF") && ($12 != "Type 1") \
    { print $2 "-" $3 "." $4 "|" $10 "|" $11 }' "$FLNM" \
  | sort | uniq -d | while read sig ; do
    awk -F '|' -v sig="$sig" \
        '($12 != "PCF") && ($12 != "Type 1") && \
         (($2 "-" $3 "." $4 "|" $10 "|" $11 ) == sig)' \
         "$FLNM" ;
    done > tmp/tmp.csv

awk -F '|' '{ print $2 "|" $10 "|" $11 "|" $9 }' tmp/tmp.csv \
  | column -t -s '|' | pretty_indent

test_wrapup duplicated-face-int tmp/tmp.csv "$TSUM"


echo -n "— "
test_help family-mixing title

# The opening brace must sit on the same line as the pattern: in awk a
# pattern followed by a newline is a complete rule that prints the record,
# and the block after it would then run for every record, including those
# without a family name
awk -F '|' '($10 != "") {
      rpm = $2 "-" $3 "." $4
      if ( ! family[rpm] ) { family[rpm] = $10 }
      else { if ( family[rpm] != $10 ) { mixed[rpm] = 1 } }
    }
    END {
      for ( rpm in mixed ) { print rpm }
    }' "$FL" | while read rpm ; do
      awk -F '|' -v rpm="$rpm" '(($2 "-" $3 "." $4) == rpm)' "$FL"
    done > tmp/tmp.csv

list-rpm tmp/tmp.csv
test_wrapup family-mixing tmp/tmp.csv "$TSUM"


echo -n "— "
test_help font-linking title

awk -F '|' '$13=="Link"' "$FL" > tmp/tmp.csv

list-rpm tmp/tmp.csv
test_wrapup font-linking tmp/tmp.csv "$TSUM"


echo -n "— "
test_help broken-symlink title

# A link is broken when no inspected package provides its target
awk -F '|' '($13 == "Link") && ($17 == "")' "$FL" > tmp/tmp.csv
awk -F '|' '{ print $9 " → " $16 "|" $2 "-" $3 "." $4  }' tmp/tmp.csv \
  | column -t -s "|" | pretty_indent

test_wrapup broken-symlink tmp/tmp.csv "$TSUM"


echo -n "— "
test_help rpmlint title

awk -F '|' '($13 != "Link") && ($7 != "0")' "$FL" > tmp/tmp.csv

list-rpm tmp/tmp.csv
test_wrapup rpmlint tmp/tmp.csv "$TSUM"


echo -n "— "
test_help libmagic title

awk -F '|' '($13 !~ /font/) && ($13 !~ /Font/) && ($13 != "Link")' "$FL" > tmp/tmp.csv

[ $(cat tmp/tmp.csv | wc -l) -gt 0 ] && \
awk -F '|' '{ print $2 "-" $3 "." $4 }' tmp/tmp.csv | uniq \
  | while read rpm ; do
    awk -F '|' -v rpm="$rpm" '($2 "-" $3 "." $4) == rpm \
        { sum+=1 ; srpm =$1 } END \
        { print sum "|" rpm "|(" srpm ")|" }' tmp/tmp.csv
  done | sort -nr | column -t -s '|' | pretty_indent

test_wrapup libmagic tmp/tmp.csv "$TSUM"


echo -n "— "
test_help fc-query title

# No fontconfig format means fc-query gave nothing usable for the file
awk -F '|' '($13 != "Link") && ($12 == "")' "$FL" > tmp/tmp.csv

[ $(cat tmp/tmp.csv | wc -l) -gt 0 ] && \
awk -F '|' '{ print $2 "-" $3 "." $4 }' tmp/tmp.csv | uniq \
  | while read rpm ; do
    awk -F '|' -v rpm="$rpm" '($2 "-" $3 "." $4) == rpm \
        { sum+=1 ; srpm =$1 } END \
        { print sum "|" rpm "|(" srpm ")|" }' tmp/tmp.csv
  done | sort -nr | column -t -s '|' | pretty_indent

test_wrapup fc-query tmp/tmp.csv "$TSUM"


echo -n "— "
test_help no-english-metadata title

awk -F '|' '($12 != "") && (($10 == "") || ($11 == ""))' "$FL" > tmp/tmp.csv

[ $(cat tmp/tmp.csv | wc -l) -gt 0 ] && \
awk -F '|' '{ print $9 "|" $2 "-" $3 "." $4 }' tmp/tmp.csv \
  | column -t -s '|' | pretty_indent

test_wrapup no-english-metadata tmp/tmp.csv "$TSUM"

}


# Print the statistics section: figures for packages that declare font
# metadata, figures for font files found in other packages, and the score
# table of all tests.
#
# Globals read: FL, TSUM. Uses tmp/tmp.csv as scratch file.
sum_up() {
  printf '%s\n' "" "Statistics:" ""

  printf '%s\n' "– packages that declare font metadata:" ""

  awk -F '|' '$6=="M"' "$FL" > tmp/tmp.csv
  stats tmp/tmp.csv

  echo "☛ File size is computed as extracted, while rpm is a compressed format." \
    | pretty_indent
  echo "☛ Mid-term, files in legacy PCF or Type1 formats need to be converted or removed." \
    | pretty_indent

  printf '%s\n' "– font files in other packages (we should not find any!)" ""

  awk -F '|' '($6 != "M") && ($13 != "Link")' "$FL" > tmp/tmp.csv
  stats tmp/tmp.csv

  # Only worth a remark when such files were actually found
  if [ "$(cat tmp/tmp.csv | wc -l)" -gt 0 ] ; then
    echo "☛ Bad packaging may result in arched packages or mixed content." \
      | pretty_indent
  fi

  printf '%s\n' "– problem summary:" ""

  test_score "$TSUM"
}


# Prepare one mail bundle per source package listed in the test summary,
# plus a script that sends them.
#
# Globals read: TSUM, FL, TIMESTAMP, ID, REPOURL
# Files read:   report.txt, $FL, $TSUM, tmp/rpmlint-<rpm>-*.txt
# Files written:
#   spam/<srpm>.tar.xz      archive of tmp/<srpm>/, which holds a copy of the
#                           report, the $FL and $TSUM lines of the source
#                           package (first line of each file kept as header),
#                           the rpmlint output of its packages and the
#                           message text
#   spam/send-messages.sh   executable script that mails every archive with
#                           mutt
prepare_spam() {
mkdir spam

# Source package names are taken from column 1 of the summary data lines
for srpm in $(awk -F '|' 'FNR>1 { print $1 }' "$TSUM" | uniq) ; do
  mkdir "tmp/$srpm/"
  cp -p report.txt "tmp/$srpm/repo-report.txt"
  # Keep the first line of each file and the lines of this source package
  for file in "$FL" "$TSUM" ; do
    awk 'FNR==1' "$file" > "tmp/$srpm/$file"
    awk -F '|' -v srpm="$srpm" 'FNR>1 && $1==srpm' "$file" >> "tmp/$srpm/$file"
  done
  # rpmlint output of every package (column 2) built from this source package
  for rpm in $(awk -F '|' 'FNR>1 { print $2 }' "tmp/$srpm/$TSUM" | uniq) ; do
    cp -p tmp/rpmlint-$rpm-*.txt "tmp/$srpm/"
  done
  # Message body; the score table is generated while the text is expanded
  cat > tmp/$srpm/message.txt << EOF
Dear packager,

At $TIMESTAMP, while scanning the $ID repository located at:
$REPOURL
I have identified the following problems in your $srpm package:

$(test_score "tmp/$srpm/$TSUM")

Please take the appropriate measures to fix the $srpm package.

I will warn you again if I find problems next time I am ran.

Your friendly QA robot,

-- 
repo-font-audit
http://fedoraproject.org/wiki/fontpackages
EOF
  # Archive the directory from inside tmp so that member names start with
  # the source package name
  cd tmp
  tar cf "$srpm.tar" "$srpm"
  cd ..
  xz -9  "tmp/$srpm.tar"
  mv "tmp/$srpm.tar.xz" spam
done
# Sending script: the list of source packages and $ID are expanded now,
# while the escaped expansions (\$srpm) are written out literally and only
# evaluated when the generated script runs
cat > spam/send-messages.sh << EOF
#!/bin/sh
# Send warnings to problem package owners
# This is a bit distribution-specific, people from other distributions are
# welcome to suggest how to make it more agnostic

#EMAIL="repo-font-audit <your@mail>"
#REPLYTO=another@mail

#export EMAIL REPLYTO

for srpm in \\
$(awk -F '|' 'FNR>1 { print $1 " \\" }' "$TSUM" | uniq)
; do
  tar -xf \$srpm.tar.xz */message.txt -O \\
    | mutt -s "Problems detected in the \$srpm $ID package!" \\
           -a \$srpm.tar.xz -- \$srpm-owner@fedoraproject.org
done

EOF
chmod +x spam/send-messages.sh
}

# Gather the results in three directories (complete data, short summary,
# mail data), turn each of them into a .tar.xz archive moved to $ORIGDIR,
# and print where the archives are.
#
# Globals read: SPAM, RES, SRES, FL, TSUM, ORIGDIR
pack_data() {
  mv spam "$SPAM"

  mkdir "$RES/"
  mkdir "$SRES/"
  cp "$FL" "$TSUM" tmp/rpmlint-*.txt summary.txt report.txt "$RES/"
  cp "$TSUM" summary.txt report.txt "$SRES/"

  for report in "$RES" "$SRES" "$SPAM" ; do
    tar cf "$report.tar" "$report"
    xz -9  "$report.tar"
    mv "$report.tar.xz" "$ORIGDIR/"
  done

  printf '%s\n' \
    "1. Complete extracted data: $ORIGDIR/$RES.tar.xz" \
    "2. Short summary: $ORIGDIR/$SRES.tar.xz" \
    "3. Mail data: $ORIGDIR/$SPAM.tar.xz" \
    "" \
    "Generated using the repo-font-audit command from" \
    "http://fedoraproject.org/wiki/fontpackages"
}

# End of function declarations

# Main program: check the arguments, work in a private temporary directory,
# run the audit stages in order, then leave the result archives in the
# directory the script was started from.

if [ "$#" -lt 2 ] ; then
  usage
fi

TIMESTAMP=$(date -u +%Y%m%dT%H%M%SZ)
ID=$1
# The timestamp gives the temporary repository definition its own id
REPOID="$1-rfa-$TIMESTAMP"
REPOURL=$2
ORIGDIR="$PWD"

# Names of the data files shared by the functions above
FPL="font-packages.csv"
PWFL="packages-with-fonts.csv"
FFL="font-files.csv"
CSL="checksums.csv"
FL="consolidated-data.csv"
FLNM="consolidated-data-no-multilib.csv"
TSUM="test-summary.csv"
RES="repo-font-audit-$ID-$TIMESTAMP"
SRES="repo-font-audit-$ID-$TIMESTAMP-short"
SPAM="repo-font-audit-$ID-$TIMESTAMP-mail"

# Without a work directory of its own the script would create, and later
# delete, files relative to whatever directory it happens to be in
TMPDIR=$(mktemp -d --tmpdir=/tmp "$RES-XXXXXXXXXX") || {
  echo "Could not create a temporary work directory." >&2
  exit 1
}
cd "$TMPDIR" || {
  echo "Could not enter $TMPDIR." >&2
  exit 1
}

mkdir tmp

collect
consolidate
analyse | tee report.txt
sum_up  | tee summary.txt

# The progress indicators erase themselves with backspaces on a terminal;
# drop every "character + backspace" pair from the saved report
sed -i "s=.$(echo -ne '\b')==g" report.txt

# Put a header line on top of the consolidated data now the tests are done
cp "$FL" "tmp/$FL"
echo "SRPM|RPM|EVR|Arch|RPM size|RPM metadata|rpmlint score|foreign data|filename|family name|face name|format (fontconfig)|format (libmagic)|file size|checksum|symlink target|symlink provider" > "$FL"
cat "tmp/$FL" >> "$FL"

prepare_spam
pack_data

cd "$ORIGDIR"
rm -fr "$TMPDIR"
echo "♻"
