| // Copyright (C) 2016 The Android Open Source Project |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package com.google.gerrit.server.mail.receive; |
| |
| import com.google.common.base.Strings; |
| import com.google.common.collect.ImmutableList; |
| import com.google.common.collect.Iterators; |
| import com.google.common.collect.PeekingIterator; |
| import com.google.gerrit.reviewdb.client.Comment; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.List; |
| import org.jsoup.Jsoup; |
| import org.jsoup.nodes.Document; |
| import org.jsoup.nodes.Element; |
| |
| /** HTMLParser provides parsing functionality for html email. */ |
| public class HtmlParser { |
| private static ImmutableList<String> MAIL_PROVIDER_EXTRAS = |
| ImmutableList.of( |
| "gmail_extra", // "On 01/01/2017 User<user@gmail.com> wrote:" |
| "gmail_quote" // Used for quoting original content |
| ); |
| |
| /** |
| * Parses comments from html email. |
| * |
| * @param email MailMessage as received from the email service. |
| * @param comments A specific set of comments as sent out in the original notification email. |
| * Comments are expected to be in the same order as they were sent out to in the email |
| * @param changeUrl Canonical change URL that points to the change on this Gerrit instance. |
| * Example: https://go-review.googlesource.com/#/c/91570 |
| * @return List of MailComments parsed from the html part of the email. |
| */ |
| public static List<MailComment> parse( |
| MailMessage email, Collection<Comment> comments, String changeUrl) { |
| // TODO(hiesel) Add support for Gmail Mobile |
| // TODO(hiesel) Add tests for other popular email clients |
| |
| // This parser goes though all html elements in the email and checks for |
| // matching patterns. It keeps track of the last file and comments it |
| // encountered to know in which context a parsed comment belongs. |
| // It uses the href attributes of <a> tags to identify comments sent out by |
| // Gerrit as these are generally more reliable then the text captions. |
| List<MailComment> parsedComments = new ArrayList<>(); |
| Document d = Jsoup.parse(email.htmlContent()); |
| PeekingIterator<Comment> iter = Iterators.peekingIterator(comments.iterator()); |
| |
| String lastEncounteredFileName = null; |
| Comment lastEncounteredComment = null; |
| for (Element e : d.body().getAllElements()) { |
| String elementName = e.tagName(); |
| boolean isInBlockQuote = |
| e.parents().stream().filter(p -> p.tagName().equals("blockquote")).findAny().isPresent(); |
| |
| if (elementName.equals("a")) { |
| String href = e.attr("href"); |
| // Check if there is still a next comment that could be contained in |
| // this <a> tag |
| if (!iter.hasNext()) { |
| continue; |
| } |
| Comment perspectiveComment = iter.peek(); |
| if (href.equals(ParserUtil.filePath(changeUrl, perspectiveComment))) { |
| if (lastEncounteredFileName == null |
| || !lastEncounteredFileName.equals(perspectiveComment.key.filename)) { |
| // Not a file-level comment, but users could have typed a comment |
| // right after this file annotation to create a new file-level |
| // comment. If this file has a file-level comment, we have already |
| // set lastEncounteredComment to that file-level comment when we |
| // encountered the file link and should not reset it now. |
| lastEncounteredFileName = perspectiveComment.key.filename; |
| lastEncounteredComment = null; |
| } else if (perspectiveComment.lineNbr == 0) { |
| // This was originally a file-level comment |
| lastEncounteredComment = perspectiveComment; |
| iter.next(); |
| } |
| } else if (ParserUtil.isCommentUrl(href, changeUrl, perspectiveComment)) { |
| // This is a regular inline comment |
| lastEncounteredComment = perspectiveComment; |
| iter.next(); |
| } |
| } else if (!isInBlockQuote |
| && elementName.equals("div") |
| && !MAIL_PROVIDER_EXTRAS.contains(e.className())) { |
| // This is a comment typed by the user |
| // Replace non-breaking spaces and trim string |
| String content = e.ownText().replace('\u00a0', ' ').trim(); |
| if (!Strings.isNullOrEmpty(content)) { |
| if (lastEncounteredComment == null && lastEncounteredFileName == null) { |
| // Remove quotation line, email signature and |
| // "Sent from my xyz device" |
| content = ParserUtil.trimQuotation(content); |
| // TODO(hiesel) Add more sanitizer |
| if (!Strings.isNullOrEmpty(content)) { |
| parsedComments.add( |
| new MailComment(content, null, null, MailComment.CommentType.CHANGE_MESSAGE)); |
| } |
| } else if (lastEncounteredComment == null) { |
| parsedComments.add( |
| new MailComment( |
| content, lastEncounteredFileName, null, MailComment.CommentType.FILE_COMMENT)); |
| } else { |
| parsedComments.add( |
| new MailComment( |
| content, null, lastEncounteredComment, MailComment.CommentType.INLINE_COMMENT)); |
| } |
| } |
| } |
| } |
| return parsedComments; |
| } |
| } |