diff options
| author | tomaw | 2014-08-13 20:22:44 +0100 | 
|---|---|---|
| committer | tomaw | 2014-08-13 20:22:44 +0100 | 
| commit | d57d74d527b02d82ba2213cb89eae7076d822c79 (patch) | |
| tree | 619bf5f24d0d11887a41653d81b304cc756012e1 /scripts/urlinfo.pl | |
| parent | 52f49d1e7ddecd4cac063215101cd0a3c565d574 (diff) | |
| parent | 6739317e04c6c3579522d1551e29ebac2a8c7555 (diff) | |
| download | scripts.irssi.org-d57d74d527b02d82ba2213cb89eae7076d822c79.tar.bz2 | |
Merge pull request #30 from dgl/urlinfo
Add urlinfo.pl
Diffstat (limited to 'scripts/urlinfo.pl')
| -rw-r--r-- | scripts/urlinfo.pl | 397 | 
1 files changed, 397 insertions, 0 deletions
| diff --git a/scripts/urlinfo.pl b/scripts/urlinfo.pl new file mode 100644 index 0000000..8715ebb --- /dev/null +++ b/scripts/urlinfo.pl @@ -0,0 +1,397 @@ +use 5.014; +use utf8; +use Irssi; +use POSIX (); + +our $VERSION = "1.1"; +our %IRSSI = ( +    authors     => 'David Leadbeater', +    contact     => 'dgl@dgl.cx', +    name        => 'urlinfo', +    description => 'Print short summaries about URLs from known services that are mentioned on IRC. (Including YouTube, etc.)', +    license     => 'WTFPL <http://dgl.cx/licence>', +    url         => 'http://dgl.cx/irssi', +); + +# This is needed so it can still run standalone for testing. +BEGIN { +  Irssi->import(20140628) if __PACKAGE__ =~ /Irssi/; # This needs Irssi 0.8.17 +} + +BEGIN { +  eval { +    require HTML::TreeBuilder; +    require URI; +  } or do { +    print "\x{3}8You need to install HTML::TreeBuilder and URI"; +    if (-f "/etc/debian_version") { +      print "Try running: \x{3}9sudo apt-get install libhtml-treebuilder-perl libwww-perl"; +    } +    die $@; +  } +} + +# ------- Settings +# /SET urlinfo_title_unknown ON|OFF +#   Show title of all unknown sites. (There is a timeout to prevent against +#   obvious resource exhaustion attacks, but remember this script has no +#   warranty.) +# +# /SET urlinfo_timeout 10 +#   How many seconds after which to give up trying to fetch a URL +# +# /SET urlinfo_ignore_domains example\.org example\.com +#   Space separated list of regular expressions of domains to ignore +# +# /SET urlinfo_custom_domains my\.domain/thing irssi\.org=description +#   A limited way of configuring custom domains, if you need something more +#   complex edit SITES below. +#   Format: domain[/path[=from]] + +# ------- Sites configuration + +# This script aims to be data driven, this hash has "site_name => {site details}" +# site details is a hash reference which can contain: +#   cleanup: A regexp of text to remove from the resulting string +#   domain: A string (or regexp, with qr//) of the domain to match (www. is +#     removed automatically). +#   from: Where to read the info "title", "description" (meta description) or +#     a regexp to match the content (default "title") +#   example: Example of this URL +#   expected: What the example should return (see end for testing) +#   items: An array ref of additional hashes to allow multiple of these values +#     e.g.: items => [ { domain => "example.com" } ] +#   path: Path component (string or regexp) +# +my %SITES = ( +  vimeo => { +    cleanup => qr/\s*on Vimeo$/, +    domain => "vimeo.com", +    path => qr{/\d+}, +    example => "http://vimeo.com/80871338", +    expected => "Journey Part 1", +  }, +  youtube => { +    cleanup => qr/\s*-\s*YouTube$/, +    items => [ +      { +        domain => "youtu.be", +        example => "http://youtu.be/wa1c6EU2bY0", +        expected => "I Am The Resurrection (Remastered)", +      }, +      { +        domain => "youtube.com", +        path => "/watch", +        example => "https://www.youtube.com/watch?v=q99JgYrgzco&list=PLE57B71744156439A", +        expected => "I Wanna Be Adored (Remastered)", +      }, +    ], +  }, +  metacpan => { +    cleanup => qr/\s*-\s*metacpan\.org$/, +    domain => "metacpan.org", +    path => qr{^/pod/}, +    example => "https://metacpan.org/pod/release/DOY/Reply-0.34/lib/Reply.pm", +    from => "description", +    expected => "read, eval, print, loop, yay!", +  }, +  pypi => { +    domain => "pypi.python.org", +    path => qr{^/pypi/}, +    from => "description", +    example => "https://pypi.python.org/pypi/stanford-corenlp-python/3.3.6-0", +    expected => "A Stanford Core NLP wrapper (wordseer fork)", +  }, +  gist => { +    cleanup => qr/\s*-\s*Gist is .*$/, +    domain => "gist.github.com", +    from => ["og:title", "description"], +    example => "https://gist.github.com/dgl/792206", +    expected => "dgl/installblead: An install script that installs a development version of perl (from git) and keeps a particular set of modules installed. Sort of perlbrew for blead, but not quite.", +  }, +  github => { +    domain => "github.com", +    items => [ +      { # issue, commit or pull +        cleanup => qr/\s*·.*$/, +        path => qr{^/[^/]+/[^/]+/(?:issues|commit|pull)/[a-f0-9]}, +        example => "https://github.com/irssi/irssi/commit/669add", +        expected => "FS#155 hilight -tag", +      }, +      { # user or project +        from => "og:description", +        path => qr{^/[^/]+(?:/[^/]+)?$}, +        example => "https://github.com/irssi/irssi", +        expected => "irssi - The client of the future", +      }, +    ], +  }, +); + +# ------- Site handling + +sub expand { +  my @expanded_sites; +  for my $site(keys %SITES) { +    expand_site(\@expanded_sites, $site, $SITES{$site}, {}); +  } +  return @expanded_sites; +} + +# This essentially implements inheritance (via the "items" key), to reduce +# duplication. +sub expand_site { +  my($expanded_sites, $site, $site_data, $current) = @_; +  delete $current->{items}; +  my $s = { +    name => $site, +    from => "title", +    %$current, +    %$site_data, +  }; +  if (exists $s->{items}) { +    expand_site($expanded_sites, $site, $_, $s) for @{$s->{items}}; +  } else { +    push @$expanded_sites, $s; +  } +} + +sub _matcher { +  my($site, $item) = @_; +  return 1 unless defined $site; +  return 1 if ref $site eq 'ARRAY' && grep _matcher($_, $item), @$site; +  return 1 if ref $site && $site->isa("Regexp") && $item =~ $site; +  return $site eq $item; +} + +sub get_site { +  my($sites, $url) = @_; + +  my $uri = URI->new($url); +  $uri = URI->new("http://$url") unless $uri and $uri->scheme; +  return unless $uri and $uri->can("host") and $uri->host and $uri->scheme =~ /^https?$/; +   +  for my $site(@$sites) { +    my $match = 1; +    $match &&= _matcher($site->{domain}, $uri->host =~ s/^www\.//ri); +    $match &&= _matcher($site->{$_}, $uri->$_) for qw(scheme host path query fragment); +    return $site, $uri if $match; +  } + +  if (Irssi::settings_get_bool("urlinfo_title_unknown")) { +    return { name => "unknown", from => "title" }, $uri; +  } + +  return; +} + +my %from = ( +  title => sub { +    $_[0]->look_down(_tag => 'title')->as_trimmed_text; +  }, +  description => sub { +    my $el = $_[0]->look_down(_tag => 'meta', name => 'description'); +    $el && $el->attr('content'); +  }, +  'og:description' => sub { +    my $el = $_[0]->look_down(_tag => 'meta', property => 'og:description'); +    $el && $el->attr('content'); +  }, +  'og:title' => sub { +    my $el = $_[0]->look_down(_tag => 'meta', property => 'og:title'); +    $el && $el->attr('content'); +  }, +); + +sub get_info { +  my($site, $uri) = @_; +  my $from = $site->{from}; +  my $tree = HTML::TreeBuilder->new_from_url($uri); +  my $info; +  if (!ref $from || ref $from eq 'ARRAY') { +    $info = join ": ", grep defined, map $from{$_}->($tree), ref $from ? @$from : $from; +  } else { +    $info = join "", $tree->as_html =~ $from; +  } +  $info =~ s/$site->{cleanup}// if $site->{cleanup}; +  $info =~ s/([\x00-\x19])/sprintf "\\x%x", ord $1/ger; +} + +# ------- IRC message handling + +# John Gruber's URL regexp (nicely handles people putting URLs in parens, etc) +my $URL_RE = qr{((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))}; + +my $pipe_in_progress; +my @sites; +my $timeout = 10; + +sub msg { +  my($server, $text, $nick, undef, $target) = @_; +  # TODO: Add a queue / multiple pipe support? +  return if $pipe_in_progress; + +  my $msg_time = time; +  my $tag = $server->{tag}; +  my $target = $target || $nick; + +  if (my($url) = $text =~ $URL_RE) { +    my($site, $uri) = get_site(\@sites, $url); +    return unless $site; +    return if ignored($uri); + +    fork_wrapper(sub { # Child +      my($fh) = @_; +      syswrite $fh, "  " . get_info($site, $uri); +    }, +    sub { # Parent +      my($in) = @_; +      if ($in =~ s/^- //) { +        print "\x{3}4urlinfo error:\x{3} $in"; +        return; +      } +      $in =~ s/^  //; +      return unless $in; + +      # Avoid reusing server just in case it is no longer valid +      my $server = Irssi::server_find_tag($tag); +      my $win = find_window($server, $target); + +      my $view = $win->view; +      my $line = $view->get_lines; +      while ($line && ($line = $line->next)) { +        if ($line->{info}->{time} >= $msg_time) { +          if ($line->get_text(0) =~ /\Q$url/) { +            last; +          } +        } +      } + +      my $timestamp = POSIX::strftime( +        Irssi::settings_get_str("timestamp_format"), localtime $msg_time); +      # I'm sure I shouldn't have to care about colours here... +      my $pad = length Irssi::strip_codes($timestamp); + +      my $text = $win->format_get_text(__PACKAGE__, $server, $target, +        "urlinfo", " " x $pad, $in); +      $win->print_after($line, MSGLEVEL_NO_ACT|MSGLEVEL_CLIENTCRAP, +        $text, $msg_time); +      $view->redraw; +    }); +  } +} + +sub ignored { +  my($uri) = @_; +  my @ignored = split / /, Irssi::settings_get_str('urlinfo_ignore_domains'); +  my $domain = $uri->host =~ s/^www\.//r; +  return grep $domain =~ /^$_$/, @ignored; +} + +sub find_window { +  my($server, $target) = @_; +  if (my $witem = $server->window_item_find($target)) { +    return $witem->window; +  } else { +    # Maybe they have a msgs window? +    my $win = Irssi::window_find_name("(msgs)"); +    # Ultimate fallback +    $win = Irssi::window_find_refnum(1) unless $win; +    return $win; +  } +} + +# Based on scriptassist. +sub fork_wrapper { +  my($child, $parent) = @_; + +  pipe(my $rfh, my $wfh); + +  my $pid = fork; +  $pipe_in_progress = 1; + +  return unless defined $pid; + +  if ($pid) { +    close $wfh; +    Irssi::pidwait_add($pid); +    my $pipetag; +    my @args = ($rfh, \$pipetag, $parent); +    $pipetag = Irssi::input_add(fileno($rfh), Irssi::INPUT_READ, \&pipe_input, \@args); +  } else { +    eval { +      local $SIG{ALRM} = sub { die "Timed out\n" }; +      alarm $timeout; +      $child->($wfh); +    }; +    alarm 0; +    syswrite $wfh, "- $@" if $@; +    POSIX::_exit(1); +  } +} + +sub pipe_input { +  my ($rfh, $pipetag, $parent) = @{$_[0]}; +  my $line = <$rfh>; +  close($rfh); +  Irssi::input_remove($$pipetag); +  $pipe_in_progress = 0; +  $parent->($line); +} + +sub setup_changed { +  $timeout = Irssi::settings_get_int("urlinfo_timeout"); + +  @sites = expand(); +  for my $site (split / /, Irssi::settings_get_str("urlinfo_custom_domains")) { +    next unless $site; + +    my($re, $from) = split /=/, $site; +    $from ||= "title"; +    my($domain, $path) = split m{/}, $re, 2; +    expand_site(\@sites, "custom", { +        domain => qr/^$domain$/, +        path => defined $path ? qr/^\/$path/ : undef, +        from => $from, +    }, {}); +  } +} + +# ------- Initialization + +if (caller) { +  # Irssi specific initialization +  require Irssi::TextUI; + +  Irssi::settings_add_str($IRSSI{name}, "urlinfo_custom_domains", ""); +  Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_domains", ""); +  Irssi::settings_add_int($IRSSI{name}, "urlinfo_timeout", $timeout); +  Irssi::settings_add_bool($IRSSI{name}, "urlinfo_title_unknown", 0); + +  Irssi::signal_add("message irc action" => \&msg); +  Irssi::signal_add("message private" => \&msg); +  Irssi::signal_add("message public" => \&msg); + +  Irssi::signal_add_last("setup changed", \&setup_changed); +  setup_changed(); + +  Irssi::theme_register([ +    'urlinfo' => '$0 %Kinfo:%n $1', +  ]); + +} else { +  # Built in test. Run this script outside Irssi to use. +  @sites = expand(); +  for my $site(@sites) { +    next unless $site->{example}; +    my($found_site, $uri) = get_site(\@sites, $site->{example}); +    if ($found_site != $site) { +      die "Got $found_site->{name}, expected $site->{name}"; +    } +    say "Get $uri"; +    my $result = get_info($site, $uri); +    say $result; +    die "Got $result, expected $site->{expected}" unless $result eq $site->{expected}; +  } +  say "OK"; +} | 
