Fix diff skipping lines (#13155)

* Fix diff skipping lines

Backport #13154

ParsePatch previously just skipped all lines that start with "+++ " or "--- "
and makes no attempt to see these lines in context.

This PR rewrites ParsePatch to pay attention to context and position
within a patch, ensuring that --- and +++ are only skipped if
appropriate.

This PR also fixes several issues with incomplete files.

Fix https://codeberg.org/Codeberg/Community/issues/308
Fix #13153

Signed-off-by: Andrew Thornton <art27@cantab.net>

* Add testcase

Signed-off-by: Andrew Thornton <art27@cantab.net>

* fix comment

* simplify error handling

Signed-off-by: Andrew Thornton <art27@cantab.net>

* never return io.EOF

Signed-off-by: Andrew Thornton <art27@cantab.net>

Co-authored-by: techknowlogick <techknowlogick@gitea.io>
This commit is contained in:
zeripath 2020-10-17 02:39:35 +01:00 committed by GitHub
parent ea95a9fa15
commit e9649b39ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 358 additions and 192 deletions

View File

@ -441,91 +441,252 @@ func (diff *Diff) LoadComments(issue *models.Issue, currentUser *models.User) er
const cmdDiffHead = "diff --git " const cmdDiffHead = "diff --git "
// ParsePatch builds a Diff object from a io.Reader and some // ParsePatch builds a Diff object from a io.Reader and some parameters.
// parameters.
// TODO: move this function to gogits/git-module
func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*Diff, error) { func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*Diff, error) {
var ( var curFile *DiffFile
diff = &Diff{Files: make([]*DiffFile, 0)}
curFile = &DiffFile{} diff := &Diff{Files: make([]*DiffFile, 0)}
curSection = &DiffSection{
Lines: make([]*DiffLine, 0, 10), sb := strings.Builder{}
// OK let's set a reasonable buffer size.
// This should be let's say at least the size of maxLineCharacters or 4096 whichever is larger.
readerSize := maxLineCharacters
if readerSize < 4096 {
readerSize = 4096
}
input := bufio.NewReaderSize(reader, readerSize)
line, err := input.ReadString('\n')
if err != nil {
if err == io.EOF {
return diff, nil
}
return diff, err
}
parsingLoop:
for {
// 1. A patch file always begins with `diff --git ` + `a/path b/path` (possibly quoted)
// if it does not we have bad input!
if !strings.HasPrefix(line, cmdDiffHead) {
return diff, fmt.Errorf("Invalid first file line: %s", line)
} }
leftLine, rightLine int // TODO: Handle skipping first n files
lineCount int if len(diff.Files) >= maxFiles {
curFileLinesCount int diff.IsIncomplete = true
curFileLFSPrefix bool _, err := io.Copy(ioutil.Discard, reader)
if err != nil {
// By the definition of io.Copy this never returns io.EOF
return diff, fmt.Errorf("Copy: %v", err)
}
break parsingLoop
}
curFile = createDiffFile(diff, line)
diff.Files = append(diff.Files, curFile)
// 2. It is followed by one or more extended header lines:
//
// old mode <mode>
// new mode <mode>
// deleted file mode <mode>
// new file mode <mode>
// copy from <path>
// copy to <path>
// rename from <path>
// rename to <path>
// similarity index <number>
// dissimilarity index <number>
// index <hash>..<hash> <mode>
//
// * <mode> 6-digit octal numbers including the file type and file permission bits.
// * <path> does not include the a/ and b/ prefixes
// * <number> percentage of unchanged lines for similarity, percentage of changed
// lines dissimilarity as integer rounded down with terminal %. 100% => equal files.
// * The index line includes the blob object names before and after the change.
// The <mode> is included if the file mode does not change; otherwise, separate
// lines indicate the old and the new mode.
// 3. Following this header the "standard unified" diff format header may be encountered: (but not for every case...)
//
// --- a/<path>
// +++ b/<path>
//
// With multiple hunks
//
// @@ <hunk descriptor> @@
// +added line
// -removed line
// unchanged line
//
// 4. Binary files get:
//
// Binary files a/<path> and b/<path> differ
//
// but one of a/<path> and b/<path> could be /dev/null.
curFileLoop:
for {
line, err = input.ReadString('\n')
if err != nil {
if err != io.EOF {
return diff, err
}
break parsingLoop
}
switch {
case strings.HasPrefix(line, "old mode ") ||
strings.HasPrefix(line, "new mode "):
if strings.HasSuffix(line, " 160000\n") {
curFile.IsSubmodule = true
}
case strings.HasPrefix(line, "copy from "):
curFile.IsRenamed = true
curFile.Type = DiffFileCopy
case strings.HasPrefix(line, "copy to "):
curFile.IsRenamed = true
curFile.Type = DiffFileCopy
case strings.HasPrefix(line, "new file"):
curFile.Type = DiffFileAdd
curFile.IsCreated = true
if strings.HasSuffix(line, " 160000\n") {
curFile.IsSubmodule = true
}
case strings.HasPrefix(line, "deleted"):
curFile.Type = DiffFileDel
curFile.IsDeleted = true
if strings.HasSuffix(line, " 160000\n") {
curFile.IsSubmodule = true
}
case strings.HasPrefix(line, "index"):
if strings.HasSuffix(line, " 160000\n") {
curFile.IsSubmodule = true
}
case strings.HasPrefix(line, "similarity index 100%"):
curFile.Type = DiffFileRename
case strings.HasPrefix(line, "Binary"):
curFile.IsBin = true
case strings.HasPrefix(line, "--- "):
// Do nothing with this line
case strings.HasPrefix(line, "+++ "):
// Do nothing with this line
lineBytes, isFragment, err := parseHunks(curFile, maxLines, maxLineCharacters, input)
diff.TotalAddition += curFile.Addition
diff.TotalDeletion += curFile.Deletion
if err != nil {
if err != io.EOF {
return diff, err
}
break parsingLoop
}
sb.Reset()
_, _ = sb.Write(lineBytes)
for isFragment {
lineBytes, isFragment, err = input.ReadLine()
if err != nil {
// Now by the definition of ReadLine this cannot be io.EOF
return diff, fmt.Errorf("Unable to ReadLine: %v", err)
}
_, _ = sb.Write(lineBytes)
}
line = sb.String()
sb.Reset()
break curFileLoop
}
}
}
// FIXME: There are numerous issues with this:
// - we might want to consider detecting encoding while parsing but...
// - we're likely to fail to get the correct encoding here anyway as we won't have enough information
// - and this doesn't really account for changes in encoding
var buf bytes.Buffer
for _, f := range diff.Files {
buf.Reset()
for _, sec := range f.Sections {
for _, l := range sec.Lines {
if l.Type == DiffLineSection {
continue
}
buf.WriteString(l.Content[1:])
buf.WriteString("\n")
}
}
charsetLabel, err := charset.DetectEncoding(buf.Bytes())
if charsetLabel != "UTF-8" && err == nil {
encoding, _ := stdcharset.Lookup(charsetLabel)
if encoding != nil {
d := encoding.NewDecoder()
for _, sec := range f.Sections {
for _, l := range sec.Lines {
if l.Type == DiffLineSection {
continue
}
if c, _, err := transform.String(d, l.Content[1:]); err == nil {
l.Content = l.Content[0:1] + c
}
}
}
}
}
}
diff.NumFiles = len(diff.Files)
return diff, nil
}
func parseHunks(curFile *DiffFile, maxLines, maxLineCharacters int, input *bufio.Reader) (lineBytes []byte, isFragment bool, err error) {
sb := strings.Builder{}
var (
curSection *DiffSection
curFileLinesCount int
curFileLFSPrefix bool
) )
input := bufio.NewReader(reader) leftLine, rightLine := 1, 1
isEOF := false
for !isEOF { for {
var linebuf bytes.Buffer sb.Reset()
for { lineBytes, isFragment, err = input.ReadLine()
b, err := input.ReadByte() if err != nil {
if err != nil { if err == io.EOF {
if err == io.EOF { return
isEOF = true
break
} else {
return nil, fmt.Errorf("ReadByte: %v", err)
}
} }
if b == '\n' { err = fmt.Errorf("Unable to ReadLine: %v", err)
break return
} }
if linebuf.Len() < maxLineCharacters { if lineBytes[0] == 'd' {
linebuf.WriteByte(b) // End of hunks
} else if linebuf.Len() == maxLineCharacters { return
}
switch lineBytes[0] {
case '@':
if curFileLinesCount >= maxLines {
curFile.IsIncomplete = true curFile.IsIncomplete = true
continue
} }
}
line := linebuf.String()
if strings.HasPrefix(line, "+++ ") || strings.HasPrefix(line, "--- ") || len(line) == 0 { _, _ = sb.Write(lineBytes)
continue for isFragment {
} // This is very odd indeed - we're in a section header and the line is too long
// This really shouldn't happen...
trimLine := strings.Trim(line, "+- ") lineBytes, isFragment, err = input.ReadLine()
if err != nil {
if trimLine == models.LFSMetaFileIdentifier { // Now by the definition of ReadLine this cannot be io.EOF
curFileLFSPrefix = true err = fmt.Errorf("Unable to ReadLine: %v", err)
} return
if curFileLFSPrefix && strings.HasPrefix(trimLine, models.LFSMetaFileOidPrefix) {
oid := strings.TrimPrefix(trimLine, models.LFSMetaFileOidPrefix)
if len(oid) == 64 {
m := &models.LFSMetaObject{Oid: oid}
count, err := models.Count(m)
if err == nil && count > 0 {
curFile.IsBin = true
curFile.IsLFSFile = true
curSection.Lines = nil
} }
_, _ = sb.Write(lineBytes)
} }
} line := sb.String()
curFileLinesCount++ // Create a new section to represent this hunk
lineCount++
// Diff data too large, we only show the first about maxLines lines
if curFileLinesCount >= maxLines {
curFile.IsIncomplete = true
}
switch {
case line[0] == ' ':
diffLine := &DiffLine{Type: DiffLinePlain, Content: line, LeftIdx: leftLine, RightIdx: rightLine}
leftLine++
rightLine++
curSection.Lines = append(curSection.Lines, diffLine)
curSection.FileName = curFile.Name
continue
case line[0] == '@':
curSection = &DiffSection{} curSection = &DiffSection{}
curFile.Sections = append(curFile.Sections, curSection) curFile.Sections = append(curFile.Sections, curSection)
lineSectionInfo := getDiffLineSectionInfo(curFile.Name, line, leftLine-1, rightLine-1) lineSectionInfo := getDiffLineSectionInfo(curFile.Name, line, leftLine-1, rightLine-1)
diffLine := &DiffLine{ diffLine := &DiffLine{
Type: DiffLineSection, Type: DiffLineSection,
@ -538,148 +699,132 @@ func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*D
leftLine = lineSectionInfo.LeftIdx leftLine = lineSectionInfo.LeftIdx
rightLine = lineSectionInfo.RightIdx rightLine = lineSectionInfo.RightIdx
continue continue
case line[0] == '+': case '\\':
if curFileLinesCount >= maxLines {
curFile.IsIncomplete = true
continue
}
// This is used only to indicate that the current file does not have a terminal newline
if !bytes.Equal(lineBytes, []byte("\\ No newline at end of file")) {
err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes))
return
}
// Technically this should be the end the file!
// FIXME: we should be putting a marker at the end of the file if there is no terminal new line
continue
case '+':
curFileLinesCount++
curFile.Addition++ curFile.Addition++
diff.TotalAddition++ if curFileLinesCount >= maxLines {
diffLine := &DiffLine{Type: DiffLineAdd, Content: line, RightIdx: rightLine} curFile.IsIncomplete = true
continue
}
diffLine := &DiffLine{Type: DiffLineAdd, RightIdx: rightLine}
rightLine++ rightLine++
curSection.Lines = append(curSection.Lines, diffLine) curSection.Lines = append(curSection.Lines, diffLine)
curSection.FileName = curFile.Name case '-':
continue curFileLinesCount++
case line[0] == '-':
curFile.Deletion++ curFile.Deletion++
diff.TotalDeletion++ if curFileLinesCount >= maxLines {
diffLine := &DiffLine{Type: DiffLineDel, Content: line, LeftIdx: leftLine} curFile.IsIncomplete = true
continue
}
diffLine := &DiffLine{Type: DiffLineDel, LeftIdx: leftLine}
if leftLine > 0 { if leftLine > 0 {
leftLine++ leftLine++
} }
curSection.Lines = append(curSection.Lines, diffLine) curSection.Lines = append(curSection.Lines, diffLine)
curSection.FileName = curFile.Name case ' ':
case strings.HasPrefix(line, "Binary"): curFileLinesCount++
curFile.IsBin = true if curFileLinesCount >= maxLines {
continue curFile.IsIncomplete = true
continue
}
diffLine := &DiffLine{Type: DiffLinePlain, LeftIdx: leftLine, RightIdx: rightLine}
leftLine++
rightLine++
curSection.Lines = append(curSection.Lines, diffLine)
default:
// This is unexpected
err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes))
return
} }
// Get new file. line := string(lineBytes)
if strings.HasPrefix(line, cmdDiffHead) { if isFragment {
if len(diff.Files) >= maxFiles { curFile.IsIncomplete = true
diff.IsIncomplete = true for isFragment {
_, err := io.Copy(ioutil.Discard, reader) lineBytes, isFragment, err = input.ReadLine()
if err != nil { if err != nil {
return nil, fmt.Errorf("Copy: %v", err) // Now by the definition of ReadLine this cannot be io.EOF
err = fmt.Errorf("Unable to ReadLine: %v", err)
return
} }
break
} }
}
curSection.Lines[len(curSection.Lines)-1].Content = line
// Note: In case file name is surrounded by double quotes (it happens only in git-shell). // handle LFS
// e.g. diff --git "a/xxx" "b/xxx" if line[1:] == models.LFSMetaFileIdentifier {
var a string curFileLFSPrefix = true
var b string } else if curFileLFSPrefix && strings.HasPrefix(line[1:], models.LFSMetaFileOidPrefix) {
oid := strings.TrimPrefix(line[1:], models.LFSMetaFileOidPrefix)
if len(oid) == 64 {
m := &models.LFSMetaObject{Oid: oid}
count, err := models.Count(m)
rd := strings.NewReader(line[len(cmdDiffHead):]) if err == nil && count > 0 {
char, _ := rd.ReadByte() curFile.IsBin = true
_ = rd.UnreadByte() curFile.IsLFSFile = true
if char == '"' { curSection.Lines = nil
fmt.Fscanf(rd, "%q ", &a)
if a[0] == '\\' {
a = a[1:]
}
} else {
fmt.Fscanf(rd, "%s ", &a)
}
char, _ = rd.ReadByte()
_ = rd.UnreadByte()
if char == '"' {
fmt.Fscanf(rd, "%q", &b)
if b[0] == '\\' {
b = b[1:]
}
} else {
fmt.Fscanf(rd, "%s", &b)
}
a = a[2:]
b = b[2:]
curFile = &DiffFile{
Name: b,
OldName: a,
Index: len(diff.Files) + 1,
Type: DiffFileChange,
Sections: make([]*DiffSection, 0, 10),
IsRenamed: a != b,
}
diff.Files = append(diff.Files, curFile)
curFileLinesCount = 0
leftLine = 1
rightLine = 1
curFileLFSPrefix = false
// Check file diff type and is submodule.
for {
line, err := input.ReadString('\n')
if err != nil {
if err == io.EOF {
isEOF = true
} else {
return nil, fmt.Errorf("ReadString: %v", err)
}
}
switch {
case strings.HasPrefix(line, "copy from "):
curFile.IsRenamed = true
curFile.Type = DiffFileCopy
case strings.HasPrefix(line, "copy to "):
curFile.IsRenamed = true
curFile.Type = DiffFileCopy
case strings.HasPrefix(line, "new file"):
curFile.Type = DiffFileAdd
curFile.IsCreated = true
case strings.HasPrefix(line, "deleted"):
curFile.Type = DiffFileDel
curFile.IsDeleted = true
case strings.HasPrefix(line, "index"):
curFile.Type = DiffFileChange
case strings.HasPrefix(line, "similarity index 100%"):
curFile.Type = DiffFileRename
}
if curFile.Type > 0 {
if strings.HasSuffix(line, " 160000\n") {
curFile.IsSubmodule = true
}
break
} }
} }
} }
} }
}
// FIXME: detect encoding while parsing. func createDiffFile(diff *Diff, line string) *DiffFile {
var buf bytes.Buffer // The a/ and b/ filenames are the same unless rename/copy is involved.
for _, f := range diff.Files { // Especially, even for a creation or a deletion, /dev/null is not used
buf.Reset() // in place of the a/ or b/ filenames.
for _, sec := range f.Sections { //
for _, l := range sec.Lines { // When rename/copy is involved, file1 and file2 show the name of the
buf.WriteString(l.Content) // source file of the rename/copy and the name of the file that rename/copy
buf.WriteString("\n") // produces, respectively.
} //
} // Path names are quoted if necessary.
charsetLabel, err := charset.DetectEncoding(buf.Bytes()) //
if charsetLabel != "UTF-8" && err == nil { // This means that you should always be able to determine the file name even when there
encoding, _ := stdcharset.Lookup(charsetLabel) // there is potential ambiguity...
if encoding != nil { //
d := encoding.NewDecoder() // but we can be simpler with our heuristics by just forcing git to prefix things nicely
for _, sec := range f.Sections { curFile := &DiffFile{
for _, l := range sec.Lines { Index: len(diff.Files) + 1,
if c, _, err := transform.String(d, l.Content); err == nil { Type: DiffFileChange,
l.Content = c Sections: make([]*DiffSection, 0, 10),
}
}
}
}
}
} }
diff.NumFiles = len(diff.Files)
return diff, nil rd := strings.NewReader(line[len(cmdDiffHead):] + " ")
curFile.Type = DiffFileChange
curFile.OldName = readFileName(rd)
curFile.Name = readFileName(rd)
curFile.IsRenamed = curFile.Name != curFile.OldName
return curFile
}
func readFileName(rd *strings.Reader) string {
var name string
char, _ := rd.ReadByte()
_ = rd.UnreadByte()
if char == '"' {
fmt.Fscanf(rd, "%q ", &name)
if name[0] == '\\' {
name = name[1:]
}
} else {
fmt.Fscanf(rd, "%s ", &name)
}
return name[2:]
} }
// GetDiffRange builds a Diff between two commits of a repository. // GetDiffRange builds a Diff between two commits of a repository.

View File

@ -182,6 +182,27 @@ rename to a b/a a/file b/b file
oldFilename: "a b/file b/a a/file", oldFilename: "a b/file b/a a/file",
filename: "a b/a a/file b/b file", filename: "a b/a a/file b/b file",
}, },
{
name: "minuses-and-pluses",
gitdiff: `diff --git a/minuses-and-pluses b/minuses-and-pluses
index 6961180..9ba1a00 100644
--- a/minuses-and-pluses
+++ b/minuses-and-pluses
@@ -1,4 +1,4 @@
--- 1st line
-++ 2nd line
--- 3rd line
-++ 4th line
+++ 1st line
+-- 2nd line
+++ 3rd line
+-- 4th line
`,
oldFilename: "minuses-and-pluses",
filename: "minuses-and-pluses",
addition: 4,
deletion: 4,
},
} }
for _, testcase := range tests { for _, testcase := range tests {