#!/usr/bin/perl
######################################################################################################
# This script extracts nutritional info from recipe HTML pages, using hand-crafted regular expressions
######################################################################################################
#
# Input:
#   $RECIPE_LIST_FILE   one recipe URL per line
#   $RECIPE_PAGES_PATH  one saved page per URL, named "<md5_hex(url)>.html"
#
# Output (STDOUT), one tab-separated line per URL of a supported site:
#   kcal, carb, fat, prot, sodium, chol, "<hash>.html", domain, url
# where carb/fat/prot are converted to kcal (4/9/4 kcal per gram) and sodium/chol to milligrams.
#   '-' in a column means file not found or without valid HTML content
#   '?' in a column means no match in file
#
# NOTE(review): several sites use one and the same pattern for every nutrient (e.g. allrecipes),
# so every nutrient picks up the first quantity on the page. This looks like site-specific markup
# was lost from the patterns at some point -- confirm against the saved pages before trusting the
# numbers. The patterns are deliberately kept as they were found.
#
# Whitespace inside the patterns below is significant (no /x flag is used).

use strict;
use warnings;

use Digest::MD5 qw(md5 md5_hex md5_base64);

my $PATH              = "*********";
my $RECIPE_PAGES_PATH = "$PATH/recipePages";
my $RECIPE_LIST_FILE  = "$PATH/weight_uniqueRecipeUrls.tsv";

######################################################################################################
# Generic helpers
######################################################################################################

# Returns the content of $filename as one string, or "" if the file cannot be opened.
sub readFile {
    my $filename = shift;
    open(my $fh, '<', $filename) or return "";
    my $string = join("", <$fh>);
    close($fh);
    # remove the header line from the file content, such that files with only this line result in "":
    # NOTE(review): as written this only strips a single leading newline -- confirm the intended
    # header format.
    $string =~ s/^\n//;
    return $string;
}

# Prints one tab-separated result line; the page file name is "$hash.html".
sub printOutput {
    my ($kcal, $carb, $fat, $prot, $sodium, $chol, $hash, $url) = @_;
    print "$kcal\t$carb\t$fat\t$prot\t$sodium\t$chol\t$hash.html\t" . computeDomain($url) . "\t$url\n";
}

# Strips the "http://" scheme, an optional "www." prefix and everything from the first "/" on.
sub computeDomain {
    my $url = shift;
    my $domain = $url;
    # The dot is escaped so that only a literal "www." prefix is removed, consistent with the
    # per-site URL patterns used for dispatching.
    $domain =~ s|^http://(www\.)?||;
    $domain =~ s|/.*$||;
    return $domain;
}

# Removes thousands separators: "1,200" -> "1200".
sub normalizeNumber {
    my $num = shift;
    $num =~ s/,//g;
    return $num;
}

# Carbohydrate quantity -> kcal (4 kcal per gram). $unit must be 'g' or 'mg'.
sub carbToKcal {
    my ($quant, $unit) = @_;
    my $factor = $unit eq 'mg' ? 0.001 : 1;
    return 4 * normalizeNumber($quant) * $factor;
}

# Fat quantity -> kcal (9 kcal per gram). $unit must be 'g' or 'mg'.
sub fatToKcal {
    my ($quant, $unit) = @_;
    my $factor = $unit eq 'mg' ? 0.001 : 1;
    return 9 * normalizeNumber($quant) * $factor;
}

# Protein quantity -> kcal (4 kcal per gram). $unit must be 'g' or 'mg'.
sub protToKcal {
    my ($quant, $unit) = @_;
    my $factor = $unit eq 'mg' ? 0.001 : 1;
    return 4 * normalizeNumber($quant) * $factor;
}

# Quantity -> milligrams. $unit must be 'g' or 'mg'.
sub toMg {
    my ($quant, $unit) = @_;
    my $factor = $unit eq 'mg' ? 1 : 1000;
    return normalizeNumber($quant) * $factor;
}

# Returns the first defined argument. Used where a pattern has two alternatives with separate
# capture groups; unlike "$1 || $3" this keeps a captured "0".
sub firstDefined {
    foreach my $candidate (@_) {
        return $candidate if defined $candidate;
    }
    return;
}

# Whole number plus optional fraction ("2 1/2" -> 2.5). The separators are stripped from the
# whole part BEFORE the addition; a missing or zero denominator contributes nothing.
sub mixedNumber {
    my ($whole, $numer, $denom) = @_;
    return normalizeNumber($whole) + ($denom ? $numer / $denom : 0);
}

######################################################################################################
# Per-site extractors
#
# Each takes the page content and returns ($kcal, $carb, $fat, $prot, $sodium, $chol), with '?' for
# every nutrient whose pattern did not match. If the page lacks the per-serving marker or the
# calorie figure, the empty list is returned and the caller fills in the '-'/'?' placeholders.
######################################################################################################

sub extractAllrecipes {
    my $html = shift;
    return unless $html =~ m{Amount Per Serving} && $html =~ m{([0-9\.,]+)};
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal($1, $2) if $html =~ m{([0-9\.,]+)(g|mg)};
    $fat    = fatToKcal($1, $2)  if $html =~ m{([0-9\.,]+)(g|mg)};
    $prot   = protToKcal($1, $2) if $html =~ m{([0-9\.,]+)(g|mg)};
    $sodium = toMg($1, $2)       if $html =~ m{([0-9\.,]+)(g|mg)};
    $chol   = toMg($1, $2)       if $html =~ m{([0-9\.,]+)(g|mg)};
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

sub extractFood {
    my $html = shift;
    return unless $html =~ m{Amount per serving} && $html =~ m{([0-9\.,]+)};
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal($1, $2) if $html =~ m{([0-9\.,]+) (g|mg)};
    $fat    = fatToKcal($1, $2)  if $html =~ m{([0-9\.,]+) (g|mg)};
    $prot   = protToKcal($1, $2) if $html =~ m{([0-9\.,]+) (g|mg)};
    $sodium = toMg($1, $2)       if $html =~ m{([0-9\.,]+) (g|mg)};
    $chol   = toMg($1, $2)       if $html =~ m{([0-9\.,]+) (g|mg)};
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

sub extractYummly {
    my $html = shift;
    return unless $html =~ m{\nAmount Per Serving\n} && $html =~ m{([0-9\.,]+)};
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal($1, $2) if $html =~ m{\s*([0-9\.,]+)\s*\s*(g|mg)}s;
    $fat    = fatToKcal($1, $2)  if $html =~ m{\s*([0-9\.,]+)\s*\s*(g|mg)}s;
    $prot   = protToKcal($1, $2) if $html =~ m{\s*([0-9\.,]+)\s*\s*(g|mg)}s;
    $sodium = toMg($1, $2)       if $html =~ m{\s*([0-9\.,]+)\s*\s*(g|mg)}s;
    $chol   = toMg($1, $2)       if $html =~ m{\s*([0-9\.,]+)\s*\s*(g|mg)}s;
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

# The page embeds JSON records; the unit is captured before the amount, hence the reversed
# argument order. The literal braces are escaped so newer perls do not treat them as quantifiers.
sub extractMyrecipes {
    my $html = shift;
    return unless $html =~ m{Amount per serving}
               && $html =~ m/\{"type":"calories","unit":"","amount":"([0-9\.,]+)"\}/;
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal($2, $1) if $html =~ m/\{"type":"carbohydrate","unit":"(g|mg)","amount":"([0-9\.,]+)"\}/;
    $fat    = fatToKcal($2, $1)  if $html =~ m/\{"type":"fat","unit":"(g|mg)","amount":"([0-9\.,]+)"\}/;
    $prot   = protToKcal($2, $1) if $html =~ m/\{"type":"protein","unit":"(g|mg)","amount":"([0-9\.,]+)"\}/;
    $sodium = toMg($2, $1)       if $html =~ m/\{"type":"sodium","unit":"(g|mg)","amount":"([0-9\.,]+)"\}/;
    $chol   = toMg($2, $1)       if $html =~ m/\{"type":"cholesterol","unit":"(g|mg)","amount":"([0-9\.,]+)"\}/;
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

# Two page layouts are handled by one alternation per value, so the quantity is in group 1 or 3
# (1 or 2 for the calories) and the unit in group 2 or 4, depending on which alternative matched.
sub extractSparkpeople {
    my $html = shift;
    return unless $html =~ m{Amount Per Serving\n|\n  • Amount Per Serving}
               && $html =~ m{  Calories\s*([0-9\.,]+)|Calories: ([0-9\.,]+)}s;
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber(firstDefined($1, $2)), ('?') x 5);
    $carb   = carbToKcal(firstDefined($1, $3), firstDefined($2, $4))
        if $html =~ m{  Total Carbohydrate\s*([0-9\.,]+) (g|mg)|\n  • Total Carbs: ([0-9\.,]+) (g|mg)}s;
    $fat    = fatToKcal(firstDefined($1, $3), firstDefined($2, $4))
        if $html =~ m{  Total Fat\s*([0-9\.,]+) (g|mg)|\n  • Total Fat: ([0-9\.,]+) (g|mg)}s;
    $prot   = protToKcal(firstDefined($1, $3), firstDefined($2, $4))
        if $html =~ m{  Protein\s*([0-9\.,]+) (g|mg)|\n  • Protein: ([0-9\.,]+) (g|mg)}s;
    $sodium = toMg(firstDefined($1, $3), firstDefined($2, $4))
        if $html =~ m{  Sodium\s*([0-9\.,]+) (g|mg)|\n  • Sodium: ([0-9\.,]+) (g|mg)}s;
    $chol   = toMg(firstDefined($1, $3), firstDefined($2, $4))
        if $html =~ m{  Cholesterol\s*([0-9\.,]+) (g|mg)|\n  • Cholesterol: ([0-9\.,]+) (g|mg)}s;
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

# Ol' Betty Crocker needs the special regexation package: quantities may carry a fraction,
# e.g. "2 1/2g" (group 1 = whole part, groups 3/4 = numerator/denominator, group 5 = unit).
sub extractBettycrocker {
    my $html = shift;
    return unless $html =~ m{1 Serving.*} && $html =~ m{\n  • Calories ([0-9\.,]+)\n  • };
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal(mixedNumber($1, $3, $4), $5)
        if $html =~ m{\n  • Total Carbohydrate ([0-9\.,]+)( (\d)/(\d))?(g|mg)\n  • };
    $fat    = fatToKcal(mixedNumber($1, $3, $4), $5)
        if $html =~ m{\n  • Total Fat ([0-9\.,]+)( (\d)/(\d))?(g|mg)\n  • };
    $prot   = protToKcal(mixedNumber($1, $3, $4), $5)
        if $html =~ m{\n  • Protein ([0-9\.,]+)( (\d)/(\d))?(g|mg);?\n  • };
    $sodium = toMg(mixedNumber($1, $3, $4), $5)
        if $html =~ m{\n  • Sodium ([0-9\.,]+)( (\d)/(\d))?(g|mg);?\n  • };
    $chol   = toMg(mixedNumber($1, $3, $4), $5)
        if $html =~ m{\n  • Cholesterol ([0-9\.,]+)( (\d)/(\d))?(g|mg);?\n  • };
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

sub extractFoodnetwork {
    my $html = shift;
    return unless $html =~ m{\n    Per Serving\n    }
               && $html =~ m{\n    Calories\n    ([0-9\.,]+)\n    };
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal($1, $2) if $html =~ m{\n    Carbohydrates\n    ([0-9\.,]+) *(g|mg)\n    };
    # shifted index: group 1 is the optional "Total " prefix
    $fat    = fatToKcal($2, $3)  if $html =~ m{\n    (Total )?Fat\n    ([0-9\.,]+) *(g|mg)\n    };
    $prot   = protToKcal($1, $2) if $html =~ m{\n    Protein\n    ([0-9\.,]+) *(g|mg)\n    };
    $sodium = toMg($1, $2)       if $html =~ m{\n    Sodium\n    ([0-9\.,]+) *(g|mg)\n    };
    $chol   = toMg($1, $2)       if $html =~ m{\n    Cholesterol\n    ([0-9\.,]+) *(g|mg)\n    };
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

sub extractCdkitchen {
    my $html = shift;
    return unless $html =~ m{per serving} && $html =~ m{([0-9\.,]+) calories};
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    # units are spelled out on the page: map from the full word to the abbreviation
    $carb   = carbToKcal($1, $2 eq 'grams' ? 'g' : 'mg')
        if $html =~ m{([0-9\.,]+) (grams|milligrams) carbohydrates};
    $fat    = fatToKcal($1, $2 eq 'grams' ? 'g' : 'mg')
        if $html =~ m{([0-9\.,]+) (grams|milligrams) fat};
    $prot   = protToKcal($1, $2 eq 'grams' ? 'g' : 'mg')
        if $html =~ m{([0-9\.,]+) (grams|milligrams) protein};
    # inverted indices: the unit precedes the amount
    $sodium = toMg($2, $1) if $html =~ m{Sodium \((g|mg)\)\s*([0-9\.,]+)}s;
    $chol   = toMg($2, $1) if $html =~ m{Cholesterol \((g|mg)\)\s*([0-9\.,]+)}s;
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

sub extractEatingwell {
    my $html = shift;
    return unless $html =~ m{\n\n    Per serving:} && $html =~ m{\s*([0-9\.,]+)}s;
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal($1, $2) if $html =~ m{\s*([0-9\.,]+) (g|mg)}s;
    $fat    = fatToKcal($1, $2)  if $html =~ m{\s*([0-9\.,]+) (g|mg)}s;
    $prot   = protToKcal($1, $2) if $html =~ m{\s*([0-9\.,]+) (g|mg)}s;
    $sodium = toMg($1, $2)       if $html =~ m{\s*([0-9\.,]+) (g|mg)}s;
    $chol   = toMg($1, $2)       if $html =~ m{\s*([0-9\.,]+) (g|mg)}s;
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

# A value can be '--' or a bare '0' instead of "<number> <unit>"; in both cases group 3 (the
# number) is undefined and the quantity is taken as 0, with 'g' as the default unit.
sub extractDelish {
    my $html = shift;
    return unless $html =~ m{\(per serving\)} && $html =~ m{([0-9\.,]+)};
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal(defined($3) ? $3 : 0, $4 || 'g')
        if $html =~ m{Total Carbohydrate()?(([0-9\.,]+) *(g|mg)|--|0)()?};
    $fat    = fatToKcal(defined($3) ? $3 : 0, $4 || 'g')
        if $html =~ m{Total Fat()?(([0-9\.,]+) *(g|mg)|--|0)()?};
    $prot   = protToKcal(defined($3) ? $3 : 0, $4 || 'g')
        if $html =~ m{Protein()?(([0-9\.,]+) *(g|mg)|--|0)()?};
    $sodium = toMg(defined($3) ? $3 : 0, $4 || 'g')
        if $html =~ m{Sodium()?(([0-9\.,]+) *(g|mg)|--|0)()?};
    $chol   = toMg(defined($3) ? $3 : 0, $4 || 'g')
        if $html =~ m{Cholesterol()?(([0-9\.,]+) *(g|mg)|--|0)()?};
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

sub extractCookeatshare {
    my $html = shift;
    return unless $html =~ m{Amount Per Serving|\n\n    Servings:\s*1\s*\n    }
               && $html =~ m{([0-9\.,]+)};
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal($1, $2) if $html =~ m{([0-9\.,]+)(g|mg)};
    $fat    = fatToKcal($1, $2)  if $html =~ m{([0-9\.,]+)(g|mg)};
    $prot   = protToKcal($1, $2) if $html =~ m{([0-9\.,]+)(g|mg)};
    $sodium = toMg($1, $2)       if $html =~ m{Sodium\s*([0-9\.,]+)(g|mg)}s;
    $chol   = toMg($1, $2)       if $html =~ m{([0-9\.,]+)(g|mg)};
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

sub extractRecipe {
    my $html = shift;
    return unless $html =~ m{Per serving:} && $html =~ m{Per serving: Calories ([0-9\.,]+)};
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal($1, $2) if $html =~ m{Carbohydrate ([0-9\.,]+) (g|mg)};
    $fat    = fatToKcal($1, $2)  if $html =~ m{Total Fat ([0-9\.,]+) (g|mg)};
    $prot   = protToKcal($1, $2) if $html =~ m{Protein ([0-9\.,]+) (g|mg)};
    $sodium = toMg($1, $2)       if $html =~ m{Sodium ([0-9\.,]+) (g|mg)};
    $chol   = toMg($1, $2)       if $html =~ m{Cholesterol ([0-9\.,]+) (g|mg)};
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

sub extractKraftrecipes {
    my $html = shift;
    return unless $html =~ m{\n\n    nutritional info per serving\n\n    }
               && $html =~ m{([0-9\.,]+)\s*};
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb   = carbToKcal($1, $2) if $html =~ m{([0-9\.,]+) (g|mg)};
    $fat    = fatToKcal($1, $2)  if $html =~ m{([0-9\.,]+) (g|mg)};
    $prot   = protToKcal($1, $2) if $html =~ m{([0-9\.,]+) (g|mg)};
    $sodium = toMg($1, $2)       if $html =~ m{Sodium\s*\s*\n    \s* ([0-9\.,]+) (g|mg)}s;
    $chol   = toMg($1, $2)       if $html =~ m{([0-9\.,]+) (g|mg)};
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

sub extractEpicurious {
    my $html = shift;
    return unless $html =~ m{\n\n    Nutritional analysis per serving}
               && $html =~ m{: ([0-9\.,]+) calories};
    my ($kcal, $carb, $fat, $prot, $sodium, $chol) = (normalizeNumber($1), ('?') x 5);
    $carb = carbToKcal($1, $2) if $html =~ m{([0-9\.,]+)\s*(g|mg) (carbohydrate|carbs)};
    $fat  = fatToKcal($1, $2)  if $html =~ m{([0-9\.,]+)\s*(g|mg) fat};
    $prot = protToKcal($1, $2) if $html =~ m{([0-9\.,]+)\s*(g|mg) protein};
    # no sodium and cholesterol info: both stay '?'
    return ($kcal, $carb, $fat, $prot, $sodium, $chol);
}

######################################################################################################
# Main
######################################################################################################

# URL -> md5 hash (which is also the base name of the saved page), and the inverse mapping.
my %md5     = ();
my %md5_inv = ();
open(my $rec, '<', $RECIPE_LIST_FILE) or die $!;
while (my $url = <$rec>) {
    chomp $url;
    my $hash = md5_hex($url);
    $md5{$url} = $hash;
    $md5_inv{$hash} = $url;
}
close($rec);

my %goodDomains = (
    "allrecipes.com"          => 1,
    "food.com"                => 1,
    "yummly.com"              => 1,
    "myrecipes.com"           => 1,
    "recipes.sparkpeople.com" => 1,
    "bettycrocker.com"        => 1,
    "foodnetwork.com"         => 1,
    "cdkitchen.com"           => 1,
    "eatingwell.com"          => 1,
    "delish.com"              => 1,
    "cookeatshare.com"        => 1,
    "recipe.com"              => 1,
    "kraftrecipes.com"        => 1,
    "epicurious.com"          => 1
);

# URL pattern -> extractor. Tried in this order; the first matching pattern wins.
my @siteExtractors = (
    [ qr{^http://(www\.)?allrecipes\.com},           \&extractAllrecipes   ],
    [ qr{^http://(www\.)?food\.com},                 \&extractFood         ],
    [ qr{^http://(www\.)?yummly\.com},               \&extractYummly       ],
    [ qr{^http://(www\.)?myrecipes\.com},            \&extractMyrecipes    ],
    [ qr{^http://(www\.)?recipes\.sparkpeople\.com}, \&extractSparkpeople  ],
    [ qr{^http://(www\.)?bettycrocker\.com},         \&extractBettycrocker ],
    [ qr{^http://(www\.)?foodnetwork\.com},          \&extractFoodnetwork  ],
    [ qr{^http://(www\.)?cdkitchen\.com},            \&extractCdkitchen    ],
    [ qr{^http://(www\.)?eatingwell\.com},           \&extractEatingwell   ],
    [ qr{^http://(www\.)?delish\.com},               \&extractDelish       ],
    [ qr{^http://(www\.)?cookeatshare\.com},         \&extractCookeatshare ],
    [ qr{^http://(www\.)?recipe\.com},               \&extractRecipe       ],
    [ qr{^http://(www\.)?kraftrecipes\.com},         \&extractKraftrecipes ],
    [ qr{^http://(www\.)?epicurious\.com},           \&extractEpicurious   ],
);

my $count = 0;

# - means file not found or without valid HTML content
# ? means no match in file
foreach my $url (keys %md5) {
    ++$count;
    print STDERR "$count\n" if ($count % 1000 == 0);    # progress indicator
    next if (!defined($goodDomains{computeDomain($url)}));

    foreach my $site (@siteExtractors) {
        my ($urlPattern, $extract) = @$site;
        next unless $url =~ $urlPattern;

        my $html   = readFile("$RECIPE_PAGES_PATH/$md5{$url}.html");
        my @values = $extract->($html);
        if (!@values) {
            # Nothing usable on the page: mark all six columns with the same placeholder.
            my $symbol = $html eq "" ? '-' : '?';
            @values = ($symbol) x 6;
        }
        printOutput(@values, $md5{$url}, $url);
        last;
    }
}