summaryrefslogtreecommitdiffstats
path: root/scripts/urlinfo.pl
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/urlinfo.pl')
-rw-r--r--scripts/urlinfo.pl397
1 files changed, 397 insertions, 0 deletions
diff --git a/scripts/urlinfo.pl b/scripts/urlinfo.pl
new file mode 100644
index 0000000..8715ebb
--- /dev/null
+++ b/scripts/urlinfo.pl
@@ -0,0 +1,397 @@
+use 5.014;
+use utf8;
+use Irssi;
+use POSIX ();
+
+our $VERSION = "1.1";
+our %IRSSI = (
+ authors => 'David Leadbeater',
+ contact => 'dgl@dgl.cx',
+ name => 'urlinfo',
+ description => 'Print short summaries about URLs from known services that are mentioned on IRC. (Including YouTube, etc.)',
+ license => 'WTFPL <http://dgl.cx/licence>',
+ url => 'http://dgl.cx/irssi',
+);
+
+# This is needed so it can still run standalone for testing.
+BEGIN {
+ Irssi->import(20140628) if __PACKAGE__ =~ /Irssi/; # This needs Irssi 0.8.17
+}
+
+BEGIN {
+ eval {
+ require HTML::TreeBuilder;
+ require URI;
+ } or do {
+ print "\x{3}8You need to install HTML::TreeBuilder and URI";
+ if (-f "/etc/debian_version") {
+ print "Try running: \x{3}9sudo apt-get install libhtml-treebuilder-perl libwww-perl";
+ }
+ die $@;
+ }
+}
+
+# ------- Settings
+# /SET urlinfo_title_unknown ON|OFF
+# Show title of all unknown sites. (There is a timeout to prevent against
+# obvious resource exhaustion attacks, but remember this script has no
+# warranty.)
+#
+# /SET urlinfo_timeout 10
+# How many seconds after which to give up trying to fetch a URL
+#
+# /SET urlinfo_ignore_domains example\.org example\.com
+# Space separated list of regular expressions of domains to ignore
+#
+# /SET urlinfo_custom_domains my\.domain/thing irssi\.org=description
+# A limited way of configuring custom domains, if you need something more
+# complex edit SITES below.
+# Format: domain[/path[=from]]
+
+# ------- Sites configuration
+
+# This script aims to be data driven, this hash has "site_name => {site details}"
+# site details is a hash reference which can contain:
+# cleanup: A regexp of text to remove from the resulting string
+# domain: A string (or regexp, with qr//) of the domain to match (www. is
+# removed automatically).
+# from: Where to read the info "title", "description" (meta description) or
+# a regexp to match the content (default "title")
+# example: Example of this URL
+# expected: What the example should return (see end for testing)
+# items: An array ref of additional hashes to allow multiple of these values
+# e.g.: items => [ { domain => "example.com" } ]
+# path: Path component (string or regexp)
+#
+my %SITES = (
+ vimeo => {
+ cleanup => qr/\s*on Vimeo$/,
+ domain => "vimeo.com",
+ path => qr{/\d+},
+ example => "http://vimeo.com/80871338",
+ expected => "Journey Part 1",
+ },
+ youtube => {
+ cleanup => qr/\s*-\s*YouTube$/,
+ items => [
+ {
+ domain => "youtu.be",
+ example => "http://youtu.be/wa1c6EU2bY0",
+ expected => "I Am The Resurrection (Remastered)",
+ },
+ {
+ domain => "youtube.com",
+ path => "/watch",
+ example => "https://www.youtube.com/watch?v=q99JgYrgzco&list=PLE57B71744156439A",
+ expected => "I Wanna Be Adored (Remastered)",
+ },
+ ],
+ },
+ metacpan => {
+ cleanup => qr/\s*-\s*metacpan\.org$/,
+ domain => "metacpan.org",
+ path => qr{^/pod/},
+ example => "https://metacpan.org/pod/release/DOY/Reply-0.34/lib/Reply.pm",
+ from => "description",
+ expected => "read, eval, print, loop, yay!",
+ },
+ pypi => {
+ domain => "pypi.python.org",
+ path => qr{^/pypi/},
+ from => "description",
+ example => "https://pypi.python.org/pypi/stanford-corenlp-python/3.3.6-0",
+ expected => "A Stanford Core NLP wrapper (wordseer fork)",
+ },
+ gist => {
+ cleanup => qr/\s*-\s*Gist is .*$/,
+ domain => "gist.github.com",
+ from => ["og:title", "description"],
+ example => "https://gist.github.com/dgl/792206",
+ expected => "dgl/installblead: An install script that installs a development version of perl (from git) and keeps a particular set of modules installed. Sort of perlbrew for blead, but not quite.",
+ },
+ github => {
+ domain => "github.com",
+ items => [
+ { # issue, commit or pull
+ cleanup => qr/\s*·.*$/,
+ path => qr{^/[^/]+/[^/]+/(?:issues|commit|pull)/[a-f0-9]},
+ example => "https://github.com/irssi/irssi/commit/669add",
+ expected => "FS#155 hilight -tag",
+ },
+ { # user or project
+ from => "og:description",
+ path => qr{^/[^/]+(?:/[^/]+)?$},
+ example => "https://github.com/irssi/irssi",
+ expected => "irssi - The client of the future",
+ },
+ ],
+ },
+);
+
+# ------- Site handling
+
+sub expand {
+ my @expanded_sites;
+ for my $site(keys %SITES) {
+ expand_site(\@expanded_sites, $site, $SITES{$site}, {});
+ }
+ return @expanded_sites;
+}
+
+# This essentially implements inheritance (via the "items" key), to reduce
+# duplication.
+sub expand_site {
+ my($expanded_sites, $site, $site_data, $current) = @_;
+ delete $current->{items};
+ my $s = {
+ name => $site,
+ from => "title",
+ %$current,
+ %$site_data,
+ };
+ if (exists $s->{items}) {
+ expand_site($expanded_sites, $site, $_, $s) for @{$s->{items}};
+ } else {
+ push @$expanded_sites, $s;
+ }
+}
+
+sub _matcher {
+ my($site, $item) = @_;
+ return 1 unless defined $site;
+ return 1 if ref $site eq 'ARRAY' && grep _matcher($_, $item), @$site;
+ return 1 if ref $site && $site->isa("Regexp") && $item =~ $site;
+ return $site eq $item;
+}
+
+sub get_site {
+ my($sites, $url) = @_;
+
+ my $uri = URI->new($url);
+ $uri = URI->new("http://$url") unless $uri and $uri->scheme;
+ return unless $uri and $uri->can("host") and $uri->host and $uri->scheme =~ /^https?$/;
+
+ for my $site(@$sites) {
+ my $match = 1;
+ $match &&= _matcher($site->{domain}, $uri->host =~ s/^www\.//ri);
+ $match &&= _matcher($site->{$_}, $uri->$_) for qw(scheme host path query fragment);
+ return $site, $uri if $match;
+ }
+
+ if (Irssi::settings_get_bool("urlinfo_title_unknown")) {
+ return { name => "unknown", from => "title" }, $uri;
+ }
+
+ return;
+}
+
+my %from = (
+ title => sub {
+ $_[0]->look_down(_tag => 'title')->as_trimmed_text;
+ },
+ description => sub {
+ my $el = $_[0]->look_down(_tag => 'meta', name => 'description');
+ $el && $el->attr('content');
+ },
+ 'og:description' => sub {
+ my $el = $_[0]->look_down(_tag => 'meta', property => 'og:description');
+ $el && $el->attr('content');
+ },
+ 'og:title' => sub {
+ my $el = $_[0]->look_down(_tag => 'meta', property => 'og:title');
+ $el && $el->attr('content');
+ },
+);
+
+sub get_info {
+ my($site, $uri) = @_;
+ my $from = $site->{from};
+ my $tree = HTML::TreeBuilder->new_from_url($uri);
+ my $info;
+ if (!ref $from || ref $from eq 'ARRAY') {
+ $info = join ": ", grep defined, map $from{$_}->($tree), ref $from ? @$from : $from;
+ } else {
+ $info = join "", $tree->as_html =~ $from;
+ }
+ $info =~ s/$site->{cleanup}// if $site->{cleanup};
+ $info =~ s/([\x00-\x19])/sprintf "\\x%x", ord $1/ger;
+}
+
+# ------- IRC message handling
+
+# John Gruber's URL regexp (nicely handles people putting URLs in parens, etc)
+my $URL_RE = qr{((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))};
+
+my $pipe_in_progress;
+my @sites;
+my $timeout = 10;
+
+sub msg {
+ my($server, $text, $nick, undef, $target) = @_;
+ # TODO: Add a queue / multiple pipe support?
+ return if $pipe_in_progress;
+
+ my $msg_time = time;
+ my $tag = $server->{tag};
+ my $target = $target || $nick;
+
+ if (my($url) = $text =~ $URL_RE) {
+ my($site, $uri) = get_site(\@sites, $url);
+ return unless $site;
+ return if ignored($uri);
+
+ fork_wrapper(sub { # Child
+ my($fh) = @_;
+ syswrite $fh, " " . get_info($site, $uri);
+ },
+ sub { # Parent
+ my($in) = @_;
+ if ($in =~ s/^- //) {
+ print "\x{3}4urlinfo error:\x{3} $in";
+ return;
+ }
+ $in =~ s/^ //;
+ return unless $in;
+
+ # Avoid reusing server just in case it is no longer valid
+ my $server = Irssi::server_find_tag($tag);
+ my $win = find_window($server, $target);
+
+ my $view = $win->view;
+ my $line = $view->get_lines;
+ while ($line && ($line = $line->next)) {
+ if ($line->{info}->{time} >= $msg_time) {
+ if ($line->get_text(0) =~ /\Q$url/) {
+ last;
+ }
+ }
+ }
+
+ my $timestamp = POSIX::strftime(
+ Irssi::settings_get_str("timestamp_format"), localtime $msg_time);
+ # I'm sure I shouldn't have to care about colours here...
+ my $pad = length Irssi::strip_codes($timestamp);
+
+ my $text = $win->format_get_text(__PACKAGE__, $server, $target,
+ "urlinfo", " " x $pad, $in);
+ $win->print_after($line, MSGLEVEL_NO_ACT|MSGLEVEL_CLIENTCRAP,
+ $text, $msg_time);
+ $view->redraw;
+ });
+ }
+}
+
+sub ignored {
+ my($uri) = @_;
+ my @ignored = split / /, Irssi::settings_get_str('urlinfo_ignore_domains');
+ my $domain = $uri->host =~ s/^www\.//r;
+ return grep $domain =~ /^$_$/, @ignored;
+}
+
+sub find_window {
+ my($server, $target) = @_;
+ if (my $witem = $server->window_item_find($target)) {
+ return $witem->window;
+ } else {
+ # Maybe they have a msgs window?
+ my $win = Irssi::window_find_name("(msgs)");
+ # Ultimate fallback
+ $win = Irssi::window_find_refnum(1) unless $win;
+ return $win;
+ }
+}
+
+# Based on scriptassist.
+sub fork_wrapper {
+ my($child, $parent) = @_;
+
+ pipe(my $rfh, my $wfh);
+
+ my $pid = fork;
+ $pipe_in_progress = 1;
+
+ return unless defined $pid;
+
+ if ($pid) {
+ close $wfh;
+ Irssi::pidwait_add($pid);
+ my $pipetag;
+ my @args = ($rfh, \$pipetag, $parent);
+ $pipetag = Irssi::input_add(fileno($rfh), Irssi::INPUT_READ, \&pipe_input, \@args);
+ } else {
+ eval {
+ local $SIG{ALRM} = sub { die "Timed out\n" };
+ alarm $timeout;
+ $child->($wfh);
+ };
+ alarm 0;
+ syswrite $wfh, "- $@" if $@;
+ POSIX::_exit(1);
+ }
+}
+
+sub pipe_input {
+ my ($rfh, $pipetag, $parent) = @{$_[0]};
+ my $line = <$rfh>;
+ close($rfh);
+ Irssi::input_remove($$pipetag);
+ $pipe_in_progress = 0;
+ $parent->($line);
+}
+
+sub setup_changed {
+ $timeout = Irssi::settings_get_int("urlinfo_timeout");
+
+ @sites = expand();
+ for my $site (split / /, Irssi::settings_get_str("urlinfo_custom_domains")) {
+ next unless $site;
+
+ my($re, $from) = split /=/, $site;
+ $from ||= "title";
+ my($domain, $path) = split m{/}, $re, 2;
+ expand_site(\@sites, "custom", {
+ domain => qr/^$domain$/,
+ path => defined $path ? qr/^\/$path/ : undef,
+ from => $from,
+ }, {});
+ }
+}
+
+# ------- Initialization
+
+if (caller) {
+ # Irssi specific initialization
+ require Irssi::TextUI;
+
+ Irssi::settings_add_str($IRSSI{name}, "urlinfo_custom_domains", "");
+ Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_domains", "");
+ Irssi::settings_add_int($IRSSI{name}, "urlinfo_timeout", $timeout);
+ Irssi::settings_add_bool($IRSSI{name}, "urlinfo_title_unknown", 0);
+
+ Irssi::signal_add("message irc action" => \&msg);
+ Irssi::signal_add("message private" => \&msg);
+ Irssi::signal_add("message public" => \&msg);
+
+ Irssi::signal_add_last("setup changed", \&setup_changed);
+ setup_changed();
+
+ Irssi::theme_register([
+ 'urlinfo' => '$0 %Kinfo:%n $1',
+ ]);
+
+} else {
+ # Built in test. Run this script outside Irssi to use.
+ @sites = expand();
+ for my $site(@sites) {
+ next unless $site->{example};
+ my($found_site, $uri) = get_site(\@sites, $site->{example});
+ if ($found_site != $site) {
+ die "Got $found_site->{name}, expected $site->{name}";
+ }
+ say "Get $uri";
+ my $result = get_info($site, $uri);
+ say $result;
+ die "Got $result, expected $site->{expected}" unless $result eq $site->{expected};
+ }
+ say "OK";
+}