1 files changed, 397 insertions, 0 deletions
diff --git a/scripts/urlinfo.pl b/scripts/urlinfo.pl
new file mode 100644
index 0000000..8715ebb
--- /dev/null
+++ b/scripts/urlinfo.pl
@@ -0,0 +1,397 @@
+use 5.014;
+use utf8;
+use Irssi;
+use POSIX ();
+
+our $VERSION = "1.1";
+our %IRSSI = (
+    authors     => 'David Leadbeater',
+    contact     => 'dgl@dgl.cx',
+    name        => 'urlinfo',
+    description => 'Print short summaries about URLs from known services that are mentioned on IRC. (Including YouTube, etc.)',
+    license     => 'WTFPL <http://dgl.cx/licence>',
+    url         => 'http://dgl.cx/irssi',
+);
+
+# This is needed so it can still run standalone for testing.
+BEGIN {
+  Irssi->import(20140628) if __PACKAGE__ =~ /Irssi/; # This needs Irssi 0.8.17
+}
+
+BEGIN {
+  eval {
+    require HTML::TreeBuilder;
+    require URI;
+  } or do {
+    print "\x{3}8You need to install HTML::TreeBuilder and URI";
+    if (-f "/etc/debian_version") {
+      print "Try running: \x{3}9sudo apt-get install libhtml-treebuilder-perl libwww-perl";
+    }
+    die $@;
+  }
+}
+
+# ------- Settings
+# /SET urlinfo_title_unknown ON|OFF
+#   Show title of all unknown sites. (There is a timeout to prevent against
+#   obvious resource exhaustion attacks, but remember this script has no
+#   warranty.)
+#
+# /SET urlinfo_timeout 10
+#   How many seconds after which to give up trying to fetch a URL
+#
+# /SET urlinfo_ignore_domains example\.org example\.com
+#   Space separated list of regular expressions of domains to ignore
+#
+# /SET urlinfo_custom_domains my\.domain/thing irssi\.org=description
+#   A limited way of configuring custom domains, if you need something more
+#   complex edit SITES below.
+#   Format: domain[/path[=from]]
+
+# ------- Sites configuration
+
+# This script aims to be data driven, this hash has "site_name => {site details}"
+# site details is a hash reference which can contain:
+#   cleanup: A regexp of text to remove from the resulting string
+#   domain: A string (or regexp, with qr//) of the domain to match (www. is
+#     removed automatically).
+#   from: Where to read the info "title", "description" (meta description) or
+#     a regexp to match the content (default "title")
+#   example: Example of this URL
+#   expected: What the example should return (see end for testing)
+#   items: An array ref of additional hashes to allow multiple of these values
+#     e.g.: items => [ { domain => "example.com" } ]
+#   path: Path component (string or regexp)
+#
+my %SITES = (
+  vimeo => {
+    cleanup => qr/\s*on Vimeo$/,
+    domain => "vimeo.com",
+    path => qr{/\d+},
+    example => "http://vimeo.com/80871338",
+    expected => "Journey Part 1",
+  },
+  youtube => {
+    cleanup => qr/\s*-\s*YouTube$/,
+    items => [
+      {
+        domain => "youtu.be",
+        example => "http://youtu.be/wa1c6EU2bY0",
+        expected => "I Am The Resurrection (Remastered)",
+      },
+      {
+        domain => "youtube.com",
+        path => "/watch",
+        example => "https://www.youtube.com/watch?v=q99JgYrgzco&list=PLE57B71744156439A",
+        expected => "I Wanna Be Adored (Remastered)",
+      },
+    ],
+  },
+  metacpan => {
+    cleanup => qr/\s*-\s*metacpan\.org$/,
+    domain => "metacpan.org",
+    path => qr{^/pod/},
+    example => "https://metacpan.org/pod/release/DOY/Reply-0.34/lib/Reply.pm",
+    from => "description",
+    expected => "read, eval, print, loop, yay!",
+  },
+  pypi => {
+    domain => "pypi.python.org",
+    path => qr{^/pypi/},
+    from => "description",
+    example => "https://pypi.python.org/pypi/stanford-corenlp-python/3.3.6-0",
+    expected => "A Stanford Core NLP wrapper (wordseer fork)",
+  },
+  gist => {
+    cleanup => qr/\s*-\s*Gist is .*$/,
+    domain => "gist.github.com",
+    from => ["og:title", "description"],
+    example => "https://gist.github.com/dgl/792206",
+    expected => "dgl/installblead: An install script that installs a development version of perl (from git) and keeps a particular set of modules installed. Sort of perlbrew for blead, but not quite.",
+  },
+  github => {
+    domain => "github.com",
+    items => [
+      { # issue, commit or pull
+        cleanup => qr/\s*·.*$/,
+        path => qr{^/[^/]+/[^/]+/(?:issues|commit|pull)/[a-f0-9]},
+        example => "https://github.com/irssi/irssi/commit/669add",
+        expected => "FS#155 hilight -tag",
+      },
+      { # user or project
+        from => "og:description",
+        path => qr{^/[^/]+(?:/[^/]+)?$},
+        example => "https://github.com/irssi/irssi",
+        expected => "irssi - The client of the future",
+      },
+    ],
+  },
+);
+
+# ------- Site handling
+
+sub expand {
+  my @expanded_sites;
+  for my $site(keys %SITES) {
+    expand_site(\@expanded_sites, $site, $SITES{$site}, {});
+  }
+  return @expanded_sites;
+}
+
+# This essentially implements inheritance (via the "items" key), to reduce
+# duplication.
+sub expand_site {
+  my($expanded_sites, $site, $site_data, $current) = @_;
+  delete $current->{items};
+  my $s = {
+    name => $site,
+    from => "title",
+    %$current,
+    %$site_data,
+  };
+  if (exists $s->{items}) {
+    expand_site($expanded_sites, $site, $_, $s) for @{$s->{items}};
+  } else {
+    push @$expanded_sites, $s;
+  }
+}
+
+sub _matcher {
+  my($site, $item) = @_;
+  return 1 unless defined $site;
+  return 1 if ref $site eq 'ARRAY' && grep _matcher($_, $item), @$site;
+  return 1 if ref $site && $site->isa("Regexp") && $item =~ $site;
+  return $site eq $item;
+}
+
+sub get_site {
+  my($sites, $url) = @_;
+
+  my $uri = URI->new($url);
+  $uri = URI->new("http://$url") unless $uri and $uri->scheme;
+  return unless $uri and $uri->can("host") and $uri->host and $uri->scheme =~ /^https?$/;
+  
+  for my $site(@$sites) {
+    my $match = 1;
+    $match &&= _matcher($site->{domain}, $uri->host =~ s/^www\.//ri);
+    $match &&= _matcher($site->{$_}, $uri->$_) for qw(scheme host path query fragment);
+    return $site, $uri if $match;
+  }
+
+  if (Irssi::settings_get_bool("urlinfo_title_unknown")) {
+    return { name => "unknown", from => "title" }, $uri;
+  }
+
+  return;
+}
+
+my %from = (
+  title => sub {
+    $_[0]->look_down(_tag => 'title')->as_trimmed_text;
+  },
+  description => sub {
+    my $el = $_[0]->look_down(_tag => 'meta', name => 'description');
+    $el && $el->attr('content');
+  },
+  'og:description' => sub {
+    my $el = $_[0]->look_down(_tag => 'meta', property => 'og:description');
+    $el && $el->attr('content');
+  },
+  'og:title' => sub {
+    my $el = $_[0]->look_down(_tag => 'meta', property => 'og:title');
+    $el && $el->attr('content');
+  },
+);
+
+sub get_info {
+  my($site, $uri) = @_;
+  my $from = $site->{from};
+  my $tree = HTML::TreeBuilder->new_from_url($uri);
+  my $info;
+  if (!ref $from || ref $from eq 'ARRAY') {
+    $info = join ": ", grep defined, map $from{$_}->($tree), ref $from ? @$from : $from;
+  } else {
+    $info = join "", $tree->as_html =~ $from;
+  }
+  $info =~ s/$site->{cleanup}// if $site->{cleanup};
+  $info =~ s/([\x00-\x19])/sprintf "\\x%x", ord $1/ger;
+}
+
+# ------- IRC message handling
+
+# John Gruber's URL regexp (nicely handles people putting URLs in parens, etc)
+my $URL_RE = qr{((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))};
+
+my $pipe_in_progress;
+my @sites;
+my $timeout = 10;
+
+sub msg {
+  my($server, $text, $nick, undef, $target) = @_;
+  # TODO: Add a queue / multiple pipe support?
+  return if $pipe_in_progress;
+
+  my $msg_time = time;
+  my $tag = $server->{tag};
+  my $target = $target || $nick;
+
+  if (my($url) = $text =~ $URL_RE) {
+    my($site, $uri) = get_site(\@sites, $url);
+    return unless $site;
+    return if ignored($uri);
+
+    fork_wrapper(sub { # Child
+      my($fh) = @_;
+      syswrite $fh, "  " . get_info($site, $uri);
+    },
+    sub { # Parent
+      my($in) = @_;
+      if ($in =~ s/^- //) {
+        print "\x{3}4urlinfo error:\x{3} $in";
+        return;
+      }
+      $in =~ s/^  //;
+      return unless $in;
+
+      # Avoid reusing server just in case it is no longer valid
+      my $server = Irssi::server_find_tag($tag);
+      my $win = find_window($server, $target);
+
+      my $view = $win->view;
+      my $line = $view->get_lines;
+      while ($line && ($line = $line->next)) {
+        if ($line->{info}->{time} >= $msg_time) {
+          if ($line->get_text(0) =~ /\Q$url/) {
+            last;
+          }
+        }
+      }
+
+      my $timestamp = POSIX::strftime(
+        Irssi::settings_get_str("timestamp_format"), localtime $msg_time);
+      # I'm sure I shouldn't have to care about colours here...
+      my $pad = length Irssi::strip_codes($timestamp);
+
+      my $text = $win->format_get_text(__PACKAGE__, $server, $target,
+        "urlinfo", " " x $pad, $in);
+      $win->print_after($line, MSGLEVEL_NO_ACT|MSGLEVEL_CLIENTCRAP,
+        $text, $msg_time);
+      $view->redraw;
+    });
+  }
+}
+
+sub ignored {
+  my($uri) = @_;
+  my @ignored = split / /, Irssi::settings_get_str('urlinfo_ignore_domains');
+  my $domain = $uri->host =~ s/^www\.//r;
+  return grep $domain =~ /^$_$/, @ignored;
+}
+
+sub find_window {
+  my($server, $target) = @_;
+  if (my $witem = $server->window_item_find($target)) {
+    return $witem->window;
+  } else {
+    # Maybe they have a msgs window?
+    my $win = Irssi::window_find_name("(msgs)");
+    # Ultimate fallback
+    $win = Irssi::window_find_refnum(1) unless $win;
+    return $win;
+  }
+}
+
+# Based on scriptassist.
+sub fork_wrapper {
+  my($child, $parent) = @_;
+
+  pipe(my $rfh, my $wfh);
+
+  my $pid = fork;
+  $pipe_in_progress = 1;
+
+  return unless defined $pid;
+
+  if ($pid) {
+    close $wfh;
+    Irssi::pidwait_add($pid);
+    my $pipetag;
+    my @args = ($rfh, \$pipetag, $parent);
+    $pipetag = Irssi::input_add(fileno($rfh), Irssi::INPUT_READ, \&pipe_input, \@args);
+  } else {
+    eval {
+      local $SIG{ALRM} = sub { die "Timed out\n" };
+      alarm $timeout;
+      $child->($wfh);
+    };
+    alarm 0;
+    syswrite $wfh, "- $@" if $@;
+    POSIX::_exit(1);
+  }
+}
+
+sub pipe_input {
+  my ($rfh, $pipetag, $parent) = @{$_[0]};
+  my $line = <$rfh>;
+  close($rfh);
+  Irssi::input_remove($$pipetag);
+  $pipe_in_progress = 0;
+  $parent->($line);
+}
+
+sub setup_changed {
+  $timeout = Irssi::settings_get_int("urlinfo_timeout");
+
+  @sites = expand();
+  for my $site (split / /, Irssi::settings_get_str("urlinfo_custom_domains")) {
+    next unless $site;
+
+    my($re, $from) = split /=/, $site;
+    $from ||= "title";
+    my($domain, $path) = split m{/}, $re, 2;
+    expand_site(\@sites, "custom", {
+        domain => qr/^$domain$/,
+        path => defined $path ? qr/^\/$path/ : undef,
+        from => $from,
+    }, {});
+  }
+}
+
+# ------- Initialization
+
+if (caller) {
+  # Irssi specific initialization
+  require Irssi::TextUI;
+
+  Irssi::settings_add_str($IRSSI{name}, "urlinfo_custom_domains", "");
+  Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_domains", "");
+  Irssi::settings_add_int($IRSSI{name}, "urlinfo_timeout", $timeout);
+  Irssi::settings_add_bool($IRSSI{name}, "urlinfo_title_unknown", 0);
+
+  Irssi::signal_add("message irc action" => \&msg);
+  Irssi::signal_add("message private" => \&msg);
+  Irssi::signal_add("message public" => \&msg);
+
+  Irssi::signal_add_last("setup changed", \&setup_changed);
+  setup_changed();
+
+  Irssi::theme_register([
+    'urlinfo' => '$0 %Kinfo:%n $1',
+  ]);
+
+} else {
+  # Built in test. Run this script outside Irssi to use.
+  @sites = expand();
+  for my $site(@sites) {
+    next unless $site->{example};
+    my($found_site, $uri) = get_site(\@sites, $site->{example});
+    if ($found_site != $site) {
+      die "Got $found_site->{name}, expected $site->{name}";
+    }
+    say "Get $uri";
+    my $result = get_info($site, $uri);
+    say $result;
+    die "Got $result, expected $site->{expected}" unless $result eq $site->{expected};
+  }
+  say "OK";
+}